Imports¶
(Python 3.11.5)
import glob
import numpy as np
import math
import os
import pandas as pd
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
from collections import Counter
import datetime
import missingno as msno
import datetime
import pylab as py
import statsmodels.api as sm
import statsmodels.stats as sm_stats
import statsmodels.stats.api as sms
import scipy.stats as stats
from itertools import combinations
from scipy.stats import mannwhitneyu
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn import tree
EDA s vizualizáciou¶
Peter Bartoš
Analýza súborov¶
filename = "077/session.csv"
sessions = pd.read_csv(filename, sep='\t')
sessions.head()
| product_ean | scroll_move_total_rel_distance | wild_mouse_duration | pct_scroll_move_duration | session_duration | pct_click | total_load_time | pct_scroll_move | screen_height | page_activity_duration | ... | ack | pct_click_product_info | pct_doubleclick | pct_mouse_move | mouse_move_total_rel_distance | session_id | pct_rage_click | user_id | browser_name | session_start | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8950762435157 | 16.21104 | 12.66847 | 9.41905 | 208.87043 | 8.99440 | 3.70117 | 48.80969 | 768 | 9.28657 | ... | 0.0 | 53.52661 | 10.44855 | 11.77485 | 10.79176 | 132238 | 18.03466 | 1396 | edge | 2022-04-21 08:50:40 |
| 1 | 2579057139158 | 11.76366 | 13.33927 | 11.61679 | 241.51168 | 14.64515 | 6.16807 | 47.62969 | 600 | 14.73176 | ... | 1.0 | 51.22887 | 8.03629 | 10.83529 | 12.18790 | 664702 | 44.15461 | 1328 | safari | 2019-11-25 19:58:57 |
| 2 | 3390494344030 | 13.48294 | 12.35923 | 12.48669 | 132.10373 | 9.27274 | 1.63482 | 62.60031 | 1024 | 10.34849 | ... | 0.0 | 59.20266 | 11.78886 | 12.29992 | 13.21679 | 980757 | 57.85579 | 1945 | firefox | 2020-11-24 13:03:47 |
| 3 | 9429614155677 | 14.58238 | 13.97482 | 11.74635 | 257.63919 | 9.10442 | 6.45019 | 48.75305 | 600 | 14.86722 | ... | 0.0 | 59.38880 | 13.76545 | 12.04421 | 10.82077 | 1427449 | 47.29214 | 1897 | opera | 2019-09-23 17:02:23 |
| 4 | 7388234028469 | 10.57370 | 16.75585 | 12.83383 | 135.17702 | 11.71659 | 2.85352 | 54.27370 | lower | 10.07783 | ... | 0.0 | 47.80334 | 8.11174 | 11.64357 | 16.43110 | 1384343 | 44.87688 | 1257 | mobile | 2022-12-30 04:49:36 |
5 rows × 25 columns
sessions.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11104 entries, 0 to 11103 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 product_ean 11104 non-null int64 1 scroll_move_total_rel_distance 11093 non-null float64 2 wild_mouse_duration 11093 non-null float64 3 pct_scroll_move_duration 11093 non-null float64 4 session_duration 11104 non-null float64 5 pct_click 11092 non-null float64 6 total_load_time 11104 non-null float64 7 pct_scroll_move 11092 non-null float64 8 screen_height 11104 non-null object 9 page_activity_duration 11093 non-null float64 10 screen_width 11104 non-null object 11 pct_wild_mouse 11093 non-null float64 12 pct_input 11093 non-null float64 13 pct_mouse_click 11093 non-null float64 14 pct_scrandom 11093 non-null float64 15 ack 11104 non-null float64 16 pct_click_product_info 11092 non-null float64 17 pct_doubleclick 11093 non-null float64 18 pct_mouse_move 11093 non-null float64 19 mouse_move_total_rel_distance 11093 non-null float64 20 session_id 11104 non-null int64 21 pct_rage_click 11093 non-null float64 22 user_id 11104 non-null int64 23 browser_name 11104 non-null object 24 session_start 11104 non-null object dtypes: float64(18), int64(3), object(4) memory usage: 2.1+ MB
filename = "077/product.csv"
products = pd.read_csv(filename, sep='\t')
products.head()
| code | store_name | location | product_ean | |
|---|---|---|---|---|
| 0 | PK | Bhit Shah | Asia/Karachi | 6323711260827 |
| 1 | MM | Kyaikto | Asia/Yangon | 8722236798279 |
| 2 | CN | Jiangyan | Asia/Shanghai | 7371273897806 |
| 3 | MX | Oaxaca | America/Mexico_City | 3758741185803 |
| 4 | IN | Moga | Asia/Kolkata | 3698132657390 |
products.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 14972 entries, 0 to 14971 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 code 14955 non-null object 1 store_name 14972 non-null object 2 location 14972 non-null object 3 product_ean 14972 non-null int64 dtypes: int64(1), object(3) memory usage: 468.0+ KB
filename = "077/user.csv"
users = pd.read_csv(filename, sep='\t')
users.head()
| user_id | name | job | sex | residence | address | birthdate | username | current_location | race | ||
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 574 | 田中 拓真 | 医療事務員 | M | NaN | 栃木県いすみ市皇居外苑23丁目17番6号 | 1955-07-26 | yamamotoasuka@gmail.com | saitonaoto | NaN | NaN |
| 1 | 459 | 小川 知実 | NaN | NaN | 佐賀県町田市松が谷1丁目15番9号 クレスト戸島532 | 山口県我孫子市二つ室32丁目24番8号 | 1974-06-04 | mogawa@hotmail.com | yasuhiro04 | (Decimal('-38.8782895'), Decimal('-76.073506')) | unspecified |
| 2 | 649 | Martino Baracca-Baresi | NaN | NaN | Viale Alphons, 2 Piano 7\n95015, Linguaglossa ... | NaN | 2010-09-29 | elena78@live.com | gmascagni | (Decimal('-26.4396325'), Decimal('-97.080942')) | NaN |
| 3 | 1927 | 小林 明美 | NaN | F | 熊本県三鷹市太田ヶ谷3丁目21番10号 パレス東上野718 | 石川県小笠原村東三島6丁目4番9号 ハイツ天神島660 | NaN | kimurashohei@gmail.com | yoshidakumiko | (Decimal('10.623780'), Decimal('-105.985189')) | NaN |
| 4 | 474 | Pavlína Štěpánková | NaN | NaN | Mukařovského 51\n299 27 Horní Cerekev | Lešovská 28\n381 21 Trhový Štěpánov | 2016-06-18 | polakviktor@seznam.cz | zemanovakaterina | (Decimal('-31.104063'), Decimal('-169.943227')) | NaN |
users.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2528 entries, 0 to 2527 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 2528 non-null int64 1 name 2528 non-null object 2 job 1011 non-null object 3 sex 1264 non-null object 4 residence 885 non-null object 5 address 2149 non-null object 6 birthdate 1390 non-null object 7 mail 2528 non-null object 8 username 2528 non-null object 9 current_location 2402 non-null object 10 race 758 non-null object dtypes: int64(1), object(10) memory usage: 217.4+ KB
Dáta obsahujú 3 CSV súbory, ktoré majú záznamy oddelené tabulátorom (TAB). Prvý súbor obsahuje user sessions a ich náležitosti. Druhý súbor obsahuje produkty k jednotlivým product_ean zo sessions. Tretí súbor obsahuje používateľov k jednotlivým user_id zo sessions. Potencionálne by sa dali tieto súbory spojiť do jedného.
Analýza záznamov¶
sessions.shape
(11104, 25)
sessions.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11104 entries, 0 to 11103 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 product_ean 11104 non-null int64 1 scroll_move_total_rel_distance 11093 non-null float64 2 wild_mouse_duration 11093 non-null float64 3 pct_scroll_move_duration 11093 non-null float64 4 session_duration 11104 non-null float64 5 pct_click 11092 non-null float64 6 total_load_time 11104 non-null float64 7 pct_scroll_move 11092 non-null float64 8 screen_height 11104 non-null object 9 page_activity_duration 11093 non-null float64 10 screen_width 11104 non-null object 11 pct_wild_mouse 11093 non-null float64 12 pct_input 11093 non-null float64 13 pct_mouse_click 11093 non-null float64 14 pct_scrandom 11093 non-null float64 15 ack 11104 non-null float64 16 pct_click_product_info 11092 non-null float64 17 pct_doubleclick 11093 non-null float64 18 pct_mouse_move 11093 non-null float64 19 mouse_move_total_rel_distance 11093 non-null float64 20 session_id 11104 non-null int64 21 pct_rage_click 11093 non-null float64 22 user_id 11104 non-null int64 23 browser_name 11104 non-null object 24 session_start 11104 non-null object dtypes: float64(18), int64(3), object(4) memory usage: 2.1+ MB
sessions.describe
<bound method NDFrame.describe of product_ean scroll_move_total_rel_distance wild_mouse_duration \
0 8950762435157 16.21104 12.66847
1 2579057139158 11.76366 13.33927
2 3390494344030 13.48294 12.35923
3 9429614155677 14.58238 13.97482
4 7388234028469 10.57370 16.75585
... ... ... ...
11099 6249841194873 9.71781 12.54425
11100 632137138686 10.17333 13.59890
11101 7807699785675 12.24116 9.74016
11102 4520813737208 12.78939 9.78510
11103 4162232777634 13.32364 13.44029
pct_scroll_move_duration session_duration pct_click total_load_time \
0 9.41905 208.87043 8.99440 3.70117
1 11.61679 241.51168 14.64515 6.16807
2 12.48669 132.10373 9.27274 1.63482
3 11.74635 257.63919 9.10442 6.45019
4 12.83383 135.17702 11.71659 2.85352
... ... ... ... ...
11099 11.69294 223.58494 13.14252 7.06971
11100 10.96911 53.77463 9.44677 3.73454
11101 13.86946 120.52246 10.29812 4.25884
11102 11.08682 113.49210 10.81719 8.42319
11103 11.22500 160.10786 9.55937 4.72532
pct_scroll_move screen_height page_activity_duration ... ack \
0 48.80969 768 9.28657 ... 0.0
1 47.62969 600 14.73176 ... 1.0
2 62.60031 1024 10.34849 ... 0.0
3 48.75305 600 14.86722 ... 0.0
4 54.27370 lower 10.07783 ... 0.0
... ... ... ... ... ...
11099 54.63038 768 12.54483 ... 0.0
11100 51.40161 1080 9.64060 ... 0.0
11101 56.29248 1080 11.42249 ... 1.0
11102 81.74853 lower 9.60514 ... 1.0
11103 52.59706 1080 12.54013 ... 0.0
pct_click_product_info pct_doubleclick pct_mouse_move \
0 53.52661 10.44855 11.77485
1 51.22887 8.03629 10.83529
2 59.20266 11.78886 12.29992
3 59.38880 13.76545 12.04421
4 47.80334 8.11174 11.64357
... ... ... ...
11099 58.25138 16.13722 12.39076
11100 66.39444 12.29469 11.15374
11101 38.05104 12.06345 11.71504
11102 19.65066 8.02148 10.30349
11103 38.37169 13.87762 12.14628
mouse_move_total_rel_distance session_id pct_rage_click user_id \
0 10.79176 132238 18.03466 1396
1 12.18790 664702 44.15461 1328
2 13.21679 980757 57.85579 1945
3 10.82077 1427449 47.29214 1897
4 16.43110 1384343 44.87688 1257
... ... ... ... ...
11099 6.94132 208781 71.00232 806
11100 15.98719 5986 67.77192 1879
11101 11.43620 562419 58.85636 2269
11102 14.53820 469797 39.83890 876
11103 9.26288 778165 44.47783 109
browser_name session_start
0 edge 2022-04-21 08:50:40
1 safari 2019-11-25 19:58:57
2 firefox 2020-11-24 13:03:47
3 opera 2019-09-23 17:02:23
4 mobile 2022-12-30 04:49:36
... ... ...
11099 opera 2019-04-09 06:27:18
11100 safari 2023-04-02 14:58:00
11101 chrome 2021-08-30 18:39:01
11102 edge 2022-08-26 13:10:55
11103 firefox 2021-11-22 00:51:59
[11104 rows x 25 columns]>
products.shape
(14972, 4)
products.describe
<bound method NDFrame.describe of code store_name location product_ean 0 PK Bhit Shah Asia/Karachi 6323711260827 1 MM Kyaikto Asia/Yangon 8722236798279 2 CN Jiangyan Asia/Shanghai 7371273897806 3 MX Oaxaca America/Mexico_City 3758741185803 4 IN Moga Asia/Kolkata 3698132657390 ... ... ... ... ... 14967 DE Eppingen Europe/Berlin 745923047489 14968 PH Santa Catalina Asia/Manila 6751093666317 14969 BR Canavieiras America/Bahia 2677236133284 14970 DE Siegburg Europe/Berlin 663162895183 14971 PH Baliuag Asia/Manila 8354961680842 [14972 rows x 4 columns]>
users.shape
(2528, 11)
users.describe
<bound method NDFrame.describe of user_id name job sex \
0 574 田中 拓真 医療事務員 M
1 459 小川 知実 NaN NaN
2 649 Martino Baracca-Baresi NaN NaN
3 1927 小林 明美 NaN F
4 474 Pavlína Štěpánková NaN NaN
... ... ... ... ...
2523 332 Lucas Gabriel Nascimento NaN NaN
2524 2460 Dott. Osvaldo Ferrara Conservation officer, nature M
2525 991 Marcos Vinicius da Cunha NaN M
2526 1407 Gelsomina Accardo NaN F
2527 934 Lina Fagiani NaN NaN
residence \
0 NaN
1 佐賀県町田市松が谷1丁目15番9号 クレスト戸島532
2 Viale Alphons, 2 Piano 7\n95015, Linguaglossa ...
3 熊本県三鷹市太田ヶ谷3丁目21番10号 パレス東上野718
4 Mukařovského 51\n299 27 Horní Cerekev
... ...
2523 NaN
2524 NaN
2525 NaN
2526 Contrada Branciforte, 8\n84025, Corno D'Oro (SA)
2527 NaN
address birthdate \
0 栃木県いすみ市皇居外苑23丁目17番6号 1955-07-26
1 山口県我孫子市二つ室32丁目24番8号 1974-06-04
2 NaN 2010-09-29
3 石川県小笠原村東三島6丁目4番9号 ハイツ天神島660 NaN
4 Lešovská 28\n381 21 Trhový Štěpánov 2016-06-18
... ... ...
2523 NaN 1914-02-12
2524 Viale Tamburi, 291 Appartamento 9\n33092, Cava... 1988-08-10
2525 Via Gabriela Oliveira, 42\nConjunto Novo Dom B... NaN
2526 Piazza Murialdo, 787\n15066, Gavi (AL) 1998-10-31
2527 Rotonda Crisafulli, 9 Appartamento 2\n25054, M... 2012-12-31
mail username \
0 yamamotoasuka@gmail.com saitonaoto
1 mogawa@hotmail.com yasuhiro04
2 elena78@live.com gmascagni
3 kimurashohei@gmail.com yoshidakumiko
4 polakviktor@seznam.cz zemanovakaterina
... ... ...
2523 zaragao@hotmail.com fogacalaura
2524 cdibiasi@libero.it cesare39
2525 xsilva@hotmail.com joao-vitor45
2526 carusorosalia@gmail.com lombrosoornella
2527 dfantozzi@tin.it irmagramsci
current_location race
0 NaN NaN
1 (Decimal('-38.8782895'), Decimal('-76.073506')) unspecified
2 (Decimal('-26.4396325'), Decimal('-97.080942')) NaN
3 (Decimal('10.623780'), Decimal('-105.985189')) NaN
4 (Decimal('-31.104063'), Decimal('-169.943227')) NaN
... ... ...
2523 (Decimal('45.8402615'), Decimal('97.514645')) NaN
2524 (Decimal('-14.686722'), Decimal('-47.922953')) unspecified
2525 (Decimal('-49.108141'), Decimal('49.593175')) asian
2526 (Decimal('-28.966979'), Decimal('140.934091')) NaN
2527 (Decimal('-27.2510065'), Decimal('-90.637333')) NaN
[2528 rows x 11 columns]>
Postrehy z tejto časti: veľa NaN a prázdnych polí, hodilo by sa nejako rozumnejšie ukladať current_location v users, v sessions v stĺpci screen_height sú aj stringy ("lower").
Analýza jednotlivých atribútov¶
Zaujímavý atribút je aj "sex" v users, ktorý nám vie hovoriť, že aké pohlavie sa uchýlilo k nákupu skorej alebo častejšie.¶
users["sex"].describe()
count 1264 unique 2 top M freq 632 Name: sex, dtype: object
count = Counter(users["sex"])
df = pd.DataFrame(count, index=["sex"])
ax = df.plot.bar(rot=0, edgecolor="white", linewidth=5)
Atribút "race" v users, ktorý nám vie dať perspektívu ohľadom rôznych rás.¶
users["race"].describe()
count 758 unique 5 top unspecified freq 243 Name: race, dtype: object
count = Counter(users["race"].dropna()) # bez NaN hodnôt
df = pd.DataFrame(count, index=["race"])
ax = df.plot.bar(rot=0,edgecolor="white", linewidth=5)
"browser_name" atríbút je zaujímavý atribút kvôli poukázaní obľúbenosti rôznych prehliadačov pre poumžívateľov¶
sessions["browser_name"].describe()
count 11104 unique 7 top chrome freq 3362 Name: browser_name, dtype: object
count = Counter(sessions["browser_name"])
df = sessions.from_dict(count, orient="index")
df.plot(kind="bar",color="g")
<Axes: >
"session_start" atribút nám vie povedať, že kedy ľudia začali session, čo môže dávať zaujímavé štatistiky.¶
sessions["session_start"].describe()
count 11104 unique 10971 top 2021-06-25 20:27:15 freq 2 Name: session_start, dtype: object
session_starts = pd.to_datetime(sessions["session_start"])
timestamps_session_starts = session_starts.astype('int64').astype('int32')
median = timestamps_session_starts.median()
median = datetime.datetime.fromtimestamp(int(median))
str(median)
'1970-04-12 22:22:24'
mean = timestamps_session_starts.mean()
mean
-1293264.4149855908
stats.mode(timestamps_session_starts)
ModeResult(mode=-2090569728, count=2)
var = np.var(timestamps_session_starts)
var
1.5375620514701028e+18
std = np.std(timestamps_session_starts)
std = datetime.datetime.fromtimestamp(int(std))
str(std)
'2009-04-17 18:11:38'
sessions.head()
| product_ean | scroll_move_total_rel_distance | wild_mouse_duration | pct_scroll_move_duration | session_duration | pct_click | total_load_time | pct_scroll_move | screen_height | page_activity_duration | ... | ack | pct_click_product_info | pct_doubleclick | pct_mouse_move | mouse_move_total_rel_distance | session_id | pct_rage_click | user_id | browser_name | session_start | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 8950762435157 | 16.21104 | 12.66847 | 9.41905 | 208.87043 | 8.99440 | 3.70117 | 48.80969 | 768 | 9.28657 | ... | 0.0 | 53.52661 | 10.44855 | 11.77485 | 10.79176 | 132238 | 18.03466 | 1396 | edge | 2022-04-21 08:50:40 |
| 1 | 2579057139158 | 11.76366 | 13.33927 | 11.61679 | 241.51168 | 14.64515 | 6.16807 | 47.62969 | 600 | 14.73176 | ... | 1.0 | 51.22887 | 8.03629 | 10.83529 | 12.18790 | 664702 | 44.15461 | 1328 | safari | 2019-11-25 19:58:57 |
| 2 | 3390494344030 | 13.48294 | 12.35923 | 12.48669 | 132.10373 | 9.27274 | 1.63482 | 62.60031 | 1024 | 10.34849 | ... | 0.0 | 59.20266 | 11.78886 | 12.29992 | 13.21679 | 980757 | 57.85579 | 1945 | firefox | 2020-11-24 13:03:47 |
| 3 | 9429614155677 | 14.58238 | 13.97482 | 11.74635 | 257.63919 | 9.10442 | 6.45019 | 48.75305 | 600 | 14.86722 | ... | 0.0 | 59.38880 | 13.76545 | 12.04421 | 10.82077 | 1427449 | 47.29214 | 1897 | opera | 2019-09-23 17:02:23 |
| 4 | 7388234028469 | 10.57370 | 16.75585 | 12.83383 | 135.17702 | 11.71659 | 2.85352 | 54.27370 | lower | 10.07783 | ... | 0.0 | 47.80334 | 8.11174 | 11.64357 | 16.43110 | 1384343 | 44.87688 | 1257 | mobile | 2022-12-30 04:49:36 |
5 rows × 25 columns
"session_duration" atribút nám poskytne štatistiky ohľadom session trvania.¶
sessions["session_duration"].head(5)
0 208.87043 1 241.51168 2 132.10373 3 257.63919 4 135.17702 Name: session_duration, dtype: float64
sessions["session_duration"].describe()
count 11104.000000 mean 150.148311 std 60.500269 min -76.356580 25% 108.978155 50% 149.830650 75% 190.843590 max 387.564200 Name: session_duration, dtype: float64
sessions["session_duration"].median()
149.83065
np.var(sessions["session_duration"])
3659.9529354342403
sns.histplot(sessions["session_duration"],bins=100)
<Axes: xlabel='session_duration', ylabel='Count'>
"total_load_time" atribút nám poskytne štatistiky ohľadom celkové času načítavania session-u.¶
sessions["total_load_time"].head(5)
0 3.70117 1 6.16807 2 1.63482 3 6.45019 4 2.85352 Name: total_load_time, dtype: float64
sessions["total_load_time"].describe()
count 11104.000000 mean 3.995133 std 2.831194 min 0.059470 25% 1.909507 50% 3.349530 75% 5.386770 max 23.596430 Name: total_load_time, dtype: float64
sessions["total_load_time"].median()
3.34953
np.var(sessions["total_load_time"])
8.014938329182991
sns.histplot(sessions["total_load_time"],bins=100)
<Axes: xlabel='total_load_time', ylabel='Count'>
"page_activity_duration" nám dá čas dĺžky aktivity používateľa. Môže sa kombinovať s rôznymi dátami a na základe toho vyvodzovať rôzne výstupy.¶
sessions["page_activity_duration"].head(5)
0 9.28657 1 14.73176 2 10.34849 3 14.86722 4 10.07783 Name: page_activity_duration, dtype: float64
sessions["page_activity_duration"].describe()
count 11093.000000 mean 11.718873 std 2.192336 min 3.889690 25% 9.993110 50% 11.861420 75% 13.478040 max 18.055620 Name: page_activity_duration, dtype: float64
sessions["page_activity_duration"].median()
11.86142
np.var(sessions["page_activity_duration"])
4.805901803157859
sns.histplot(sessions["page_activity_duration"],bins=100)
<Axes: xlabel='page_activity_duration', ylabel='Count'>
Atribút "pct_scroll_move_duration" hovorí o celkovom čase scrollovania myši používateľom. Na základe dĺžky tohto času scrollovania, sa môže prísť k rôznym záverom, napr. čím dlhšie používateľ skroloval, tak tým mal aj dlhší session duration.¶
sessions["pct_scroll_move_duration"].head()
0 9.41905 1 11.61679 2 12.48669 3 11.74635 4 12.83383 Name: pct_scroll_move_duration, dtype: float64
sessions["pct_scroll_move_duration"].describe()
count 11093.000000 mean 11.712817 std 1.001441 min 7.894690 25% 11.047090 50% 11.715270 75% 12.374940 max 15.786200 Name: pct_scroll_move_duration, dtype: float64
sessions["pct_scroll_move_duration"].median()
11.71527
np.var(sessions["pct_scroll_move_duration"])
1.0027943573338343
sns.histplot(sessions["pct_scroll_move_duration"],bins=100)
<Axes: xlabel='pct_scroll_move_duration', ylabel='Count'>
"wild_mouse_duration" obsahuje dĺžky trvania rýchleho pohybu myši. Umožňuje rôzne perspektívy voči ostatným dátam a môže obohatiť výsledky výstupov.¶
sessions["wild_mouse_duration"].head()
0 12.66847 1 13.33927 2 12.35923 3 13.97482 4 16.75585 Name: wild_mouse_duration, dtype: float64
sessions["wild_mouse_duration"].describe()
count 11093.000000 mean 12.504114 std 2.032968 min 6.553110 25% 10.869790 50% 12.754170 75% 13.995960 max 19.351740 Name: wild_mouse_duration, dtype: float64
sessions["wild_mouse_duration"].median()
12.75417
np.var(sessions["wild_mouse_duration"])
4.13258679866344
sns.histplot(sessions["wild_mouse_duration"],bins=100)
<Axes: xlabel='wild_mouse_duration', ylabel='Count'>
"mail" z users môže ponúknuť podrobnejší náhľad do analýzy. Môže ponúknuť pohľad na to, že ktorá do doména je najzaužívanejšia.¶
users["mail"].head()
0 yamamotoasuka@gmail.com 1 mogawa@hotmail.com 2 elena78@live.com 3 kimurashohei@gmail.com 4 polakviktor@seznam.cz Name: mail, dtype: object
users["mail"].describe()
count 2528 unique 2522 top christopher35@yahoo.com freq 2 Name: mail, dtype: object
domains = pd.Series(users["mail"]).str.split("@").str[1].str.split(".").str[0]
domains.describe()
count 2528 unique 28 top gmail freq 495 Name: mail, dtype: object
count = Counter(domains)
df = pd.DataFrame(count, index=["domain"])
ax = df.plot.bar(rot=0, edgecolor="white", linewidth=5)
Párová analýza dát¶
Vykreslíme heatmapu na identifikáciu nejakých korelácií:¶
sessions_ = sessions.copy()
sessions_.loc[sessions["screen_width"] == "lower", "screen_width"] = 100
sessions_.loc[sessions["screen_width"] == "higher", "screen_width"] = 4000
sessions_.loc[sessions["screen_height"] == "lower", "screen_height"] = 200
sessions_.loc[sessions["screen_height"] == "higher", "screen_height"] = 2000
sessions_.drop(columns=["browser_name", "session_start"], inplace=True)
figure, ax = plt.subplots(figsize=(20,16))
sns.heatmap(sessions_.corr(),ax=ax, annot =True, fmt = ".3f")
<Axes: >
sns.pairplot(sessions_, vars=["pct_doubleclick", "pct_mouse_click", "mouse_move_total_rel_distance", "pct_click", "scroll_move_total_rel_distance"], hue="ack", diag_kind="kde")
<seaborn.axisgrid.PairGrid at 0x1b8f9a22050>
fig = plt.figure(figsize = (15,20))
ax = fig.gca()
sessions_.drop(columns=["product_ean", "ack", "session_id", "user_id"], inplace=True)
sessions_.hist(ax = ax)
C:\Users\peter\AppData\Local\Temp\ipykernel_34140\1277887577.py:4: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared. sessions_.hist(ax = ax)
array([[<Axes: title={'center': 'scroll_move_total_rel_distance'}>,
<Axes: title={'center': 'wild_mouse_duration'}>,
<Axes: title={'center': 'pct_scroll_move_duration'}>,
<Axes: title={'center': 'session_duration'}>],
[<Axes: title={'center': 'pct_click'}>,
<Axes: title={'center': 'total_load_time'}>,
<Axes: title={'center': 'pct_scroll_move'}>,
<Axes: title={'center': 'page_activity_duration'}>],
[<Axes: title={'center': 'pct_wild_mouse'}>,
<Axes: title={'center': 'pct_input'}>,
<Axes: title={'center': 'pct_mouse_click'}>,
<Axes: title={'center': 'pct_scrandom'}>],
[<Axes: title={'center': 'pct_click_product_info'}>,
<Axes: title={'center': 'pct_doubleclick'}>,
<Axes: title={'center': 'pct_mouse_move'}>,
<Axes: title={'center': 'mouse_move_total_rel_distance'}>],
[<Axes: title={'center': 'pct_rage_click'}>, <Axes: >, <Axes: >,
<Axes: >]], dtype=object)
Z tochto sme sa dozvedeli, že "ack","pct_mouse_click" a "pct_mouse_click","pct_doubleclick" majú najvýraznejšie korelácie. Ďalej je tam "mouse_move_total_rel_distance" a "pct_doubleclick". Zameriame sa ale na "pct_click","scroll_move_total_rel_distance" a "pct_mouse_click", "pct_doubleclick".
"pct_click" a "scroll_move_total_rel_distance"¶
"scroll_move_total_rel_distance"¶
sessions["scroll_move_total_rel_distance"].describe()
count 11093.000000 mean 12.487878 std 2.202834 min 5.362100 25% 10.827810 50% 12.596650 75% 14.112690 max 20.561370 Name: scroll_move_total_rel_distance, dtype: float64
sessions["scroll_move_total_rel_distance"].value_counts()
scroll_move_total_rel_distance
13.73166 3
13.45822 3
14.13396 3
12.47744 2
9.21988 2
..
9.65940 1
11.79636 1
10.77352 1
8.14599 1
13.32364 1
Name: count, Length: 10898, dtype: int64
sns.histplot(sessions["scroll_move_total_rel_distance"], bins=100)
<Axes: xlabel='scroll_move_total_rel_distance', ylabel='Count'>
sns.boxplot(y=sessions["scroll_move_total_rel_distance"])
<Axes: ylabel='scroll_move_total_rel_distance'>
"pct_click"¶
sessions["pct_click"].describe()
count 11092.000000 mean 10.971727 std 2.036769 min 5.705560 25% 9.595955 50% 10.469120 75% 11.908938 max 19.581860 Name: pct_click, dtype: float64
sessions["pct_click"].value_counts()
pct_click
10.13820 3
8.61142 3
15.70534 2
8.65411 2
10.11221 2
..
10.71871 1
14.24953 1
10.61140 1
8.47760 1
9.55937 1
Name: count, Length: 10866, dtype: int64
sns.histplot(sessions["pct_click"], bins=100)
<Axes: xlabel='pct_click', ylabel='Count'>
sns.boxplot(y=sessions["pct_click"])
<Axes: ylabel='pct_click'>
Korelácia¶
sns.scatterplot(data=sessions, x="scroll_move_total_rel_distance", y="pct_click")
<Axes: xlabel='scroll_move_total_rel_distance', ylabel='pct_click'>
sns.regplot(data=sessions, x="scroll_move_total_rel_distance", y="pct_click")
<Axes: xlabel='scroll_move_total_rel_distance', ylabel='pct_click'>
Graf vyjadruje koreláciu medzi "pct_click" a "scroll_move_total_rel_distance".
sessions["scroll_move_total_rel_distance"].corr(sessions["pct_click"])
-0.5695362318337306
"pct_mouse_click" a "pct_doubleclick"¶
"pct_mouse_click"¶
sessions["pct_mouse_click"].describe()
count 11093.000000 mean 11.575115 std 2.232086 min 4.713520 25% 9.850220 50% 11.522390 75% 13.330950 max 19.808560 Name: pct_mouse_click, dtype: float64
sessions["pct_mouse_click"].value_counts()
pct_mouse_click
12.62643 3
12.11899 2
14.15521 2
10.36686 2
16.40696 2
..
13.71954 1
14.79558 1
11.19086 1
8.84903 1
13.42017 1
Name: count, Length: 10894, dtype: int64
sns.histplot(sessions["pct_mouse_click"], bins=100)
<Axes: xlabel='pct_mouse_click', ylabel='Count'>
sns.boxplot(y=sessions["pct_mouse_click"])
<Axes: ylabel='pct_mouse_click'>
"pct_doubleclick"¶
sessions["pct_doubleclick"].describe()
count 11093.000000 mean 10.843130 std 2.549654 min 0.000000 25% 9.181720 50% 10.771320 75% 12.508670 max 19.635710 Name: pct_doubleclick, dtype: float64
sessions["pct_doubleclick"].value_counts()
pct_doubleclick
12.14482 3
8.64674 2
17.97927 2
14.00654 2
4.58874 2
..
8.20385 1
10.31049 1
13.07989 1
11.36072 1
13.87762 1
Name: count, Length: 10905, dtype: int64
sns.histplot(sessions["pct_doubleclick"], bins=100)
<Axes: xlabel='pct_doubleclick', ylabel='Count'>
sns.boxplot(y=sessions["pct_doubleclick"])
<Axes: ylabel='pct_doubleclick'>
Korelácia¶
sns.scatterplot(data=sessions, x="pct_mouse_click", y="pct_doubleclick")
<Axes: xlabel='pct_mouse_click', ylabel='pct_doubleclick'>
sns.regplot(data=sessions, x="pct_mouse_click", y="pct_doubleclick")
<Axes: xlabel='pct_mouse_click', ylabel='pct_doubleclick'>
Graf vyjadruje koreláciu medzi "pct_doubleclick" a "pct_mouse_click".
sessions["pct_doubleclick"].corr(sessions["pct_mouse_click"])
0.6655516305017672
Identifikácia problémov v dátach s prvotným riešením¶
Hlib Kokin
Identifikacia problémov v dátach¶
Pozrime sa či sú nejaké stĺpce typu object¶
sessions.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11104 entries, 0 to 11103 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 product_ean 11104 non-null int64 1 scroll_move_total_rel_distance 11093 non-null float64 2 wild_mouse_duration 11093 non-null float64 3 pct_scroll_move_duration 11093 non-null float64 4 session_duration 11104 non-null float64 5 pct_click 11092 non-null float64 6 total_load_time 11104 non-null float64 7 pct_scroll_move 11092 non-null float64 8 screen_height 11104 non-null object 9 page_activity_duration 11093 non-null float64 10 screen_width 11104 non-null object 11 pct_wild_mouse 11093 non-null float64 12 pct_input 11093 non-null float64 13 pct_mouse_click 11093 non-null float64 14 pct_scrandom 11093 non-null float64 15 ack 11104 non-null float64 16 pct_click_product_info 11092 non-null float64 17 pct_doubleclick 11093 non-null float64 18 pct_mouse_move 11093 non-null float64 19 mouse_move_total_rel_distance 11093 non-null float64 20 session_id 11104 non-null int64 21 pct_rage_click 11093 non-null float64 22 user_id 11104 non-null int64 23 browser_name 11104 non-null object 24 session_start 11104 non-null object dtypes: float64(18), int64(3), object(4) memory usage: 2.1+ MB
sessions.screen_height.unique()
array(['768', '600', '1024', 'lower', '1080', 'higher', '800'],
dtype=object)
sessions.screen_width.unique()
array(['1366', '1280', '1024', 'lower', '1920', 'higher', '800'],
dtype=object)
sessions['screen_width'].isnull().values.any()
False
sessions['screen_height'].isnull().values.any()
False
users.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2528 entries, 0 to 2527 Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 user_id 2528 non-null int64 1 name 2528 non-null object 2 job 1011 non-null object 3 sex 1264 non-null object 4 residence 885 non-null object 5 address 2149 non-null object 6 birthdate 1390 non-null object 7 mail 2528 non-null object 8 username 2528 non-null object 9 current_location 2402 non-null object 10 race 758 non-null object dtypes: int64(1), object(10) memory usage: 217.4+ KB
users.name.unique().shape
(2469,)
users.race.unique()
array([nan, 'unspecified', 'indian', 'white', 'asian', 'black'],
dtype=object)
users.sex.unique()
array(['M', nan, 'F'], dtype=object)
users.shape[0]
2528
users[users['sex'].isnull()].shape[0]
1264
products.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 14972 entries, 0 to 14971 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 code 14955 non-null object 1 store_name 14972 non-null object 2 location 14972 non-null object 3 product_ean 14972 non-null int64 dtypes: int64(1), object(3) memory usage: 468.0+ KB
Tu máme tri stĺpce s typom objektu
products.head(15)
| code | store_name | location | product_ean | |
|---|---|---|---|---|
| 0 | PK | Bhit Shah | Asia/Karachi | 6323711260827 |
| 1 | MM | Kyaikto | Asia/Yangon | 8722236798279 |
| 2 | CN | Jiangyan | Asia/Shanghai | 7371273897806 |
| 3 | MX | Oaxaca | America/Mexico_City | 3758741185803 |
| 4 | IN | Moga | Asia/Kolkata | 3698132657390 |
| 5 | UZ | Jizzax | Asia/Samarkand | 6521391328503 |
| 6 | US | Davis | America/Los_Angeles | 2358131673609 |
| 7 | GB | Devizes | Europe/London | 4680879308733 |
| 8 | BR | Porto Velho | America/Porto_Velho | 9004277473278 |
| 9 | CN | Tianfu | Asia/Shanghai | 8772276925265 |
| 10 | JP | Annaka | Asia/Tokyo | 3837147004465 |
| 11 | EC | Milagro | America/Guayaquil | 3984613159041 |
| 12 | UA | Svatove | Europe/Zaporozhye | 2190577998607 |
| 13 | JP | Fujioka | Asia/Tokyo | 8616592109865 |
| 14 | LT | Ukmerge | Europe/Vilnius | 5598016640723 |
products.code.unique().shape
(134,)
products.location.unique().shape
(184,)
products.location.shape
(14972,)
Pozrime sa na missing values¶
Sessions¶
msno.matrix(sessions)
plt.figure(figsize = (15,9))
plt.show()
<Figure size 1500x900 with 0 Axes>
Dať nie je veľa, vyrátame percenta
sessions[sessions.isna().any(axis=1)].shape[0] / sessions.shape[0] * 100
1.5039625360230549
Users¶
msno.matrix(users)
plt.figure(figsize = (15,9))
plt.show()
<Figure size 1500x900 with 0 Axes>
DataFrame users ma skoro v každom riadku NaN hodnotu
users[users.isna().any(axis=1)].shape[0] / users.shape[0] * 100
98.97151898734177
Stĺpci "race", "sex", "job", "residence", "address" a "birthdate" nemáju vo väčšine riadkov nastavenú žiadnu hodnotu, čo môže predstavovať problém pri určovaní trendov a závislostí
Products¶
msno.matrix(products)
plt.figure(figsize = (15,9))
plt.show()
<Figure size 1500x900 with 0 Axes>
Žiaden problem tu
Pozrime sa na outlierov¶
Sessions¶
sessions_num = sessions.loc[:,sessions.dtypes == 'float64']
sessions_num.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11104 entries, 0 to 11103 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 scroll_move_total_rel_distance 11093 non-null float64 1 wild_mouse_duration 11093 non-null float64 2 pct_scroll_move_duration 11093 non-null float64 3 session_duration 11104 non-null float64 4 pct_click 11092 non-null float64 5 total_load_time 11104 non-null float64 6 pct_scroll_move 11092 non-null float64 7 page_activity_duration 11093 non-null float64 8 pct_wild_mouse 11093 non-null float64 9 pct_input 11093 non-null float64 10 pct_mouse_click 11093 non-null float64 11 pct_scrandom 11093 non-null float64 12 ack 11104 non-null float64 13 pct_click_product_info 11092 non-null float64 14 pct_doubleclick 11093 non-null float64 15 pct_mouse_move 11093 non-null float64 16 mouse_move_total_rel_distance 11093 non-null float64 17 pct_rage_click 11093 non-null float64 dtypes: float64(18) memory usage: 1.5 MB
sessions_num_melt = pd.melt(sessions_num.drop(columns=['session_duration', 'ack']))
sessions_num_melt.head()
| variable | value | |
|---|---|---|
| 0 | scroll_move_total_rel_distance | 16.21104 |
| 1 | scroll_move_total_rel_distance | 11.76366 |
| 2 | scroll_move_total_rel_distance | 13.48294 |
| 3 | scroll_move_total_rel_distance | 14.58238 |
| 4 | scroll_move_total_rel_distance | 10.57370 |
sns.boxplot(x="value", y="variable", data=sessions_num_melt)
<Axes: xlabel='value', ylabel='variable'>
sns.boxplot(data=sessions_num.iloc[:, 3])
<Axes: ylabel='session_duration'>
V každom z číselných stĺpcov existuje dostatok odľahlých hodnôt
Users, Products¶
Používatelia nemajú stĺpce, v ktorých potrebujeme vidieť odľahlé hodnoty
Sessions¶
Pozrime sa na duplicitné riadky¶
d_sessions = sessions[sessions.duplicated()]
print("Percent duplicitnych riadkov v sessions: " + str(d_sessions.shape[0] / sessions.shape[0] * 100))
Percent duplicitnych riadkov v sessions: 1.1887608069164266
d_sessions.head()
| product_ean | scroll_move_total_rel_distance | wild_mouse_duration | pct_scroll_move_duration | session_duration | pct_click | total_load_time | pct_scroll_move | screen_height | page_activity_duration | ... | ack | pct_click_product_info | pct_doubleclick | pct_mouse_move | mouse_move_total_rel_distance | session_id | pct_rage_click | user_id | browser_name | session_start | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1239 | 533629680853 | 9.55431 | 14.07222 | 11.20358 | 249.74352 | 13.34731 | 4.29222 | 46.08453 | 768 | 12.91603 | ... | 1.0 | 55.05283 | 10.05268 | 10.83486 | 13.91799 | 823393 | 56.75369 | 720 | mobile | 2022-09-28 19:59:24 |
| 1333 | 5243497834696 | 9.68393 | 14.28337 | 12.71928 | 133.04608 | 12.14642 | 1.15648 | 58.76371 | 600 | 13.38122 | ... | 1.0 | 75.95562 | 11.60355 | 10.72007 | 13.76373 | 441346 | 56.12082 | 1035 | opera | 2023-01-26 20:36:52 |
| 1406 | 9352805806778 | 14.58083 | 12.83500 | 11.33966 | 133.53462 | 11.10115 | 3.09888 | 29.16313 | 800 | 10.30587 | ... | 0.0 | 74.33166 | 9.56594 | 11.62037 | 12.85726 | 839343 | 55.10088 | 1701 | other | 2022-01-19 00:14:57 |
| 1936 | 8832168272966 | 10.53058 | 16.26628 | 9.68015 | 95.47645 | 10.77483 | 0.63052 | 57.84443 | 600 | 13.14020 | ... | 0.0 | 60.08062 | 13.33718 | 11.92089 | 8.45825 | 1458895 | 79.11481 | 2147 | firefox | 2023-03-14 15:57:15 |
| 2151 | 7633058796415 | 13.76811 | 10.14203 | 10.73033 | 105.50410 | 9.33366 | 4.80952 | 48.53424 | 768 | 8.53969 | ... | 1.0 | 42.54977 | 8.37466 | 13.34782 | 13.37367 | 1481600 | 50.16869 | 1246 | chrome | 2020-07-04 02:09:55 |
5 rows × 25 columns
Users¶
d_users = users[users.duplicated()]
print("Percent duplicitnych riadkov v users: " + str(d_users.shape[0] / users.shape[0] * 100))
Percent duplicitnych riadkov v users: 0.0
Products¶
d_products = products[products.duplicated()]
print("Percent duplicitnych riadkov v products: " + str(d_products.shape[0] / products.shape[0] * 100))
Percent duplicitnych riadkov v products: 0.0
Návrhy riešení a čiastočné riešenie¶
DType == Object¶
Sessions¶
Existujú dve jedinečné nečíselné hodnoty pre šírku a výšku obrazovky, nahraďme ich minimálnymi a maximálnymi hodnotami a potom pre screen_width a screen_height vyzerá to tak že stačí iba zmeniť typ na float64, lebo tvar dať nám aj tak bude vyhovat a nie su tam žiadne missing value.
Pre stĺpec "browser_name" budeme musieť použiť labelEncoder a údaje o session_start môžeme preložiť na datumovy typ.
Users¶
Stĺpce ako "sex", "race" majú kategorické hodnoty, takže môžeme použiť LabelEncoder.
Jediné, čo môžeme z e-mailov získať, je doména, ostatné je pre nás bezvýznamné. Potom orežeme domény a zakódujeme výsledné kategórie. Prezývky pre nás nie sú zaujímavé, takže ich môžeme vyhodiť
Stĺpce týkajúce sa address a residence nemôžeme nazvať kategorickými, skúsme z nich vybrať aspoň krajinu, a tak ich budeme môcť kódovať už z kategórií
Products¶
Stĺpec s názvom predajne nás nezaujíma, môžeme ho vyhodiť, ostatné stĺpce (kód, umiestnenie) môžeme kódovať podľa názvu. Lokalít je viac ako 150, môžeme vyradiť len kontinent alebo krajinu a získať menší počet kategórií
Riadky s prázdnymi hodnotami¶
Sessions¶
Celkový počet riadkov s prázdnymi hodnotami rovná sa 1.5 percentám od celkového poctu riadkov, môžme ich bud vymazat alebo doplniť na základe veľkého počtu riadkov, v ktorých tieto údaje existujú, čo nebude veľký problém, pretože percento chýbajúcich hodnôt je 1,5 %.
Users¶
Stĺpci "race", "sex", "job", "residence", "address" a "birthdate" majú vysoký počet chýbajúcich hodnôt, riešením tohto problému je doplnenie údajov na základe trendov, ktoré uvidíme pri pohľade na riadky, v ktorých sa údaje nachádzajú, nebude to také jednoduché ako odstránenie menej ako dvoch percent riadkov ako v prípade relácií, ale výsledkom bude zachovanie pravdivosti údajov bez chýbajúcich hodnôt v každom riadku, ak sa správne vypočítajú.
Outliers¶
Sessions¶
V tomto súbore je 16 stĺpcov s odľahlými hodnotami, pravdepodobne najlepším riešením je nastaviť prijateľné krajné hodnoty a odstrániť všetko, čo nie je v ich rozsahu.
Duplicitné hodnoty¶
Duplikátov je menej ako dve percentá a môžeme ich z našich údajov odstrániť.
Hypotézy¶
A - Veľký počet kliknutí myšou odrádza od nákupu¶
Hlib Kokin
Pri pohľade na heatmapu si možno všimnúť silnú negatívnu koreláciu medzi stĺpcami "ack" a "pct_mouse_click", čo znamená, že štatistika nákupu pri prvej návšteve je tým nižšia, čím viac kliknutí myšou používateľ vykoná, a naopak - viac nákupov pri prvej návšteve sa uskutoční pri nízkom počte kliknutí myšou.
copy = sessions.copy()
sns.boxplot(x = "ack", y = "pct_mouse_click", data=copy)
<Axes: xlabel='ack', ylabel='pct_mouse_click'>
copy["ack"].value_counts()
ack 1.0 6059 0.0 5045 Name: count, dtype: int64
sns.histplot(copy["pct_mouse_click"].dropna())
<Axes: xlabel='pct_mouse_click', ylabel='Count'>
bought_nothing_A = copy[copy["ack"]==0].pct_mouse_click
bought_something_A = copy[copy["ack"]==1].pct_mouse_click
bought_nothing_A = bought_nothing_A.dropna()
bought_something_A = bought_something_A.dropna()
sns.histplot(bought_nothing_A)
<Axes: xlabel='pct_mouse_click', ylabel='Count'>
bins = np.linspace(4, 20, 100)
plt.hist(bought_something_A, bins, alpha=0.67, label="Did buy", color="green")
plt.hist(bought_nothing_A, bins, alpha=0.67, label="Didn't buy", color="orange")
plt.legend()
plt.xlabel("Total Rel Mouse Distance")
plt.ylabel("Count")
plt.show()
Teraz odstránime odľahlé hodnoty
bought_something_mean, bought_something_std = np.mean(bought_something_A), np.std(bought_something_A)
print('len=', len(bought_something_A), 'mean=', bought_something_mean, 'std=', bought_something_std)
cut_off = bought_something_std * 3
lower, upper = bought_something_mean - cut_off, bought_something_mean + cut_off
print('cutoff=', cut_off, 'lower=', lower, 'upper=', upper)
outliers = [x for x in bought_something_A if x < lower or x > upper]
print('Identified outliers: %d' % len(outliers))
# remove outliers
outliers_removed = [x for x in bought_something_A if x >= lower and x <= upper]
print('Non-outlier observations: %d' % len(outliers_removed))
bought_something_A = outliers_removed
sns.histplot(bought_something_A)
len= 6050 mean= 10.146079702479339 std= 1.5814811336426937 cutoff= 4.744443400928081 lower= 5.401636301551258 upper= 14.890523103407421 Identified outliers: 26 Non-outlier observations: 6024
<Axes: ylabel='Count'>
Zistime normalnost pomocou shapiroveho testu
shapiro_test = stats.shapiro(bought_something_A)
print(shapiro_test)
alpha = 0.05
if shapiro_test.pvalue > alpha:
print('Normal distribution (fail to reject H0)')
else:
print('Another distributions (reject H0)')
ShapiroResult(statistic=0.9984059929847717, pvalue=8.655034434923436e-06) Another distributions (reject H0)
C:\Users\peter\AppData\Local\Programs\Python\Python310\lib\site-packages\scipy\stats\_morestats.py:1882: UserWarning: p-value may not be accurate for N > 5000.
warnings.warn("p-value may not be accurate for N > 5000.")
shapiro_test = stats.shapiro(bought_nothing_A)
print(shapiro_test)
alpha = 0.05
if shapiro_test.pvalue > alpha:
print('Normal distribution (fail to reject H0)')
else:
print('Another distributions (reject H0)')
ShapiroResult(statistic=0.9776387214660645, pvalue=1.8879361614093367e-27) Another distributions (reject H0)
V oboch testoch bola hypotéza normality zamietnutá, čo znamená, že medzi oboma stĺpcami existuje rozdiel. Pozrime sa na signifikantnost tohto rozdielu
stat, p = mannwhitneyu(bought_something_A, bought_nothing_A)
print('Statistics=%.3f, p=%.3f' % (stat, p))
alpha = 0.05
if p > alpha:
print('Same distribution (fail to reject H0)')
else:
print('Different distribution (reject H0)')
Statistics=2422810.500, p=0.000 Different distribution (reject H0)
def cohend(d1, d2):
# calculate the size of samples
n1, n2 = len(d1), len(d2)
# calculate the variance of the samples
s1, s2 = np.var(d1, ddof=1), np.var(d2, ddof=1)
# calculate the pooled standard deviation
s = np.sqrt(((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 + n2 - 2))
# calculate the means of the samples
u1, u2 = np.mean(d1), np.mean(d2)
# calculate the effect size
d = (u1 - u2) / s
return d
sns.distplot(bought_something_A, bins=10)
sns.distplot(bought_nothing_A, bins=10)
cd = cohend(bought_something_A, bought_nothing_A)
print("Statistical power analysis:")
if 0.2 <= cd < 0.5:
print('Small effect - Cohens d value: %f' % cd)
elif 0.5 <= cd < 0.8:
print('Medium effect - Cohens d value: %f' % cd)
elif 0.8 <= cd:
print('Large effect - Cohens d value: %f' % cd)
C:\Users\peter\AppData\Local\Temp\ipykernel_34140\4194722491.py:14: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(bought_something_A, bins=10) C:\Users\peter\AppData\Local\Temp\ipykernel_34140\4194722491.py:15: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(bought_nothing_A, bins=10)
Statistical power analysis:
Hypotéza je štatisticky viditeľná (p == 0)
print('Confidence interval pre nákup', sms.DescrStatsW(bought_something_A).tconfint_mean())
print('Confidence interval pre žiadny nákup', sms.DescrStatsW(bought_nothing_A).tconfint_mean())
Confidence interval pre nákup (10.091111948012804, 10.168834891296624) Confidence interval pre žiadny nákup (13.245254114495506, 13.333753547610382)
Štatisticky sme dokázali, že táto hypotéza je pravdivá, t. j. že viac kliknutí myšou používateľa odradí od nákupu. Histogram ukazuje, že na uskutočnenie nákupu pri prvej návšteve je v priemere potrebných 8 kliknutí, ale ak táto hodnota narastie na 14, nákup sa s najväčšou pravdepodobnosťou neuskutoční. Možno je to preto, že osoba, ktorá na začiatku vie, čo chce kúpiť, nevynaloží veľa kliknutí na kúpu už vybraného výrobku, zatiaľ čo používateľ, ktorý nevie, čo potrebuje, alebo sa len prišiel pozrieť, bude listovať v zoznamoch výrobkov bez toho, aby si vybral niečo na okamžitý nákup.
B - Totálna prejdená vzdialenosť myši inklinuje používateľa k nákupu¶
Peter Bartoš
Po prezrení hodnôt v heatmape vidieť tiež značnú kladnú koreláciu medzi stĺpcom "ack" a "mouse_move_total_rel_distance". Toto môže znamenať, že čím väčšiu má myš prejdenú totálnu vzdialenosť, tak sa zlepšuje štatistika nákupu.
tmd = sessions.copy()
sns.boxplot(x = "ack", y = "mouse_move_total_rel_distance", data=tmd)
<Axes: xlabel='ack', ylabel='mouse_move_total_rel_distance'>
tmd["ack"].value_counts()
ack 1.0 6059 0.0 5045 Name: count, dtype: int64
sns.histplot(tmd["mouse_move_total_rel_distance"])
<Axes: xlabel='mouse_move_total_rel_distance', ylabel='Count'>
bought_nothing = tmd[tmd["ack"]==0].mouse_move_total_rel_distance
bought_something = tmd[tmd["ack"]==1].mouse_move_total_rel_distance
bought_nothing = bought_nothing.dropna()
bought_something = bought_something.dropna()
sns.histplot(bought_nothing)
<Axes: xlabel='mouse_move_total_rel_distance', ylabel='Count'>
sns.histplot(bought_something)
<Axes: xlabel='mouse_move_total_rel_distance', ylabel='Count'>
Teraz vymažeme outliers z dát, aby boli presnejšie.
bought_something_mean, bought_something_std = np.mean(bought_something), np.std(bought_something)
print('len=', len(bought_something), 'mean=', bought_something_mean, 'std=', bought_something_std)
cut_off = bought_something_std * 3
lower, upper = bought_something_mean - cut_off, bought_something_mean + cut_off
print('cutoff=', cut_off, 'lower=', lower, 'upper=', upper)
outliers = [x for x in bought_something if x < lower or x > upper]
print('Identified outliers: %d' % len(outliers))
# remove outliers
outliers_removed = [x for x in bought_something if x >= lower and x <= upper]
print('Non-outlier observations: %d' % len(outliers_removed))
bought_something = outliers_removed
sns.histplot(bought_something)
len= 6054 mean= 13.398524517674264 std= 1.2823036529406053 cutoff= 3.846910958821816 lower= 9.551613558852448 upper= 17.24543547649608 Identified outliers: 70 Non-outlier observations: 5984
<Axes: ylabel='Count'>
bought_nothing_mean, bought_nothing_std = np.mean(bought_nothing), np.std(bought_nothing)
print('len=', len(bought_nothing), 'mean=', bought_nothing_mean, 'std=', bought_nothing_std)
cut_off = bought_nothing_std * 3
lower, upper = bought_nothing_mean - cut_off, bought_nothing_mean + cut_off
print('cutoff=', cut_off, 'lower=', lower, 'upper=', upper)
outliers = [x for x in bought_nothing if x < lower or x > upper]
print('Identified outliers: %d' % len(outliers))
# remove outliers
outliers_removed = [x for x in bought_nothing if x >= lower and x <= upper]
print('Non-outlier observations: %d' % len(outliers_removed))
bought_nothing = outliers_removed
sns.histplot(bought_nothing)
len= 5039 mean= 11.832656328636634 std= 2.3222833595969594 cutoff= 6.966850078790879 lower= 4.865806249845756 upper= 18.799506407427515 Identified outliers: 3 Non-outlier observations: 5036
<Axes: ylabel='Count'>
bins = np.linspace(4, 20, 100)
plt.hist(bought_something, bins, alpha=0.67, label="Did buy", color="green")
plt.hist(bought_nothing, bins, alpha=0.67, label="Didn't buy", color="orange")
plt.legend()
plt.xlabel("Total Rel Mouse Distance")
plt.ylabel("Count")
plt.show()
Vyzualizovali sme dáta pre lepšie pochopenie, prezretie rozdielov a narábanie s nimi. Teraz potrebujeme zistiť normálnosť. To znamená, že z akého rozdelenia dáta pochádzajú. Shapirov štatistický test nám pomôže určiť, či sú dáta z normálového rozdelenia.
shapiro_test = stats.shapiro(bought_something)
print(shapiro_test)
alpha = 0.05
if shapiro_test.pvalue > alpha:
print('Normal distribution (fail to reject H0)')
else:
print('Another distributions (reject H0)')
ShapiroResult(statistic=0.9890832901000977, pvalue=4.110146564883055e-21) Another distributions (reject H0)
C:\Users\peter\AppData\Local\Programs\Python\Python310\lib\site-packages\scipy\stats\_morestats.py:1882: UserWarning: p-value may not be accurate for N > 5000.
warnings.warn("p-value may not be accurate for N > 5000.")
shapiro_test = stats.shapiro(bought_nothing)
print(shapiro_test)
alpha = 0.05
if shapiro_test.pvalue > alpha:
print('Normal distribution (fail to reject H0)')
else:
print('Another distributions (reject H0)')
ShapiroResult(statistic=0.9936949610710144, pvalue=4.155800120806026e-14) Another distributions (reject H0)
Pri obidvoch sa zamietlo H0, keďže pri obidvoch bola p-hodnota menšia ako alpha (0.05). Ďalej vykoná mannwhitneyov test, ktorý nám určí signifikantnosť rozdielu týchto dvoch vzoriek dát.
stat, p = mannwhitneyu(bought_something, bought_nothing)
print('Statistics=%.3f, p=%.3f' % (stat, p))
alpha = 0.05
if p > alpha:
print('Same distribution (fail to reject H0)')
else:
print('Different distribution (reject H0)')
Statistics=21538743.500, p=0.000 Different distribution (reject H0)
Vyšla p-hodnota skoro nulová, čo znamená, že táto hypotéza je naozaj štatisticky viditeľná.
sns.distplot(bought_something, bins=10)
sns.distplot(bought_nothing, bins=10)
cd = cohend(bought_something, bought_nothing)
print("Statistical power analysis:")
if 0.2 <= cd < 0.5:
print('Small effect - Cohens d value: %f' % cd)
elif 0.5 <= cd < 0.8:
print('Medium effect - Cohens d value: %f' % cd)
elif 0.8 <= cd:
print('Large effect - Cohens d value: %f' % cd)
C:\Users\peter\AppData\Local\Temp\ipykernel_34140\249406191.py:1: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(bought_something, bins=10) C:\Users\peter\AppData\Local\Temp\ipykernel_34140\249406191.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(bought_nothing, bins=10)
Statistical power analysis: Large effect - Cohens d value: 0.887452
print('Confidence interval pre nákup', sms.DescrStatsW(bought_something).tconfint_mean())
print('Confidence interval pre žiadny nákup', sms.DescrStatsW(bought_nothing).tconfint_mean())
Confidence interval pre nákup (13.396517465183974, 13.456838353666297) Confidence interval pre žiadny nákup (11.770243676284359, 11.898194588211268)
bought = pd.DataFrame(bought_something)
didnt_buy = pd.DataFrame(bought_nothing)
print("bought mean=", bought.mean())
print("didnt_buy mean=", didnt_buy.mean())
sns.histplot(bought[0], bins=5, color="green")
sns.histplot(didnt_buy[0], bins=5, color="orange")
bought mean= 0 13.426678 dtype: float64 didnt_buy mean= 0 11.834219 dtype: float64
<Axes: xlabel='0', ylabel='Count'>
Dokázalo sa teda, že naozaj čím človek prejde väčšiu vzdialenosť kurzorom, tak tým viacej inklinuje k nákupu. Z vizualizácie dát na grafoch ale vidno, že to platí len do určitého ohraničenia tej totálnej vzdialenosti (~15). Odtiaľ vyššie je to ťažko určiteľné, keďže sa tam krivky prelínajú.
Záver¶
Údaje nie sú vo vhodnom formáte na trénovanie modelu, niektoré typy stĺpcov je potrebné zmeniť kódovaním ( napriklad LabelEncoder: object -> float64), nahradením konštantami alebo konverziou na správny formát dátového typu, riešenie chýbajúcich hodnôt ich nahradením mediánovou hodnotou, odstránenie odľahlých hodnôt zameraním sa na median a odstránenie duplikátov.
Integrácia a čistenie dát¶
Peter Bartoš
Načítanie dát¶
df = pd.merge(sessions, users, on=["user_id", "user_id"])
df.head()
| product_ean | scroll_move_total_rel_distance | wild_mouse_duration | pct_scroll_move_duration | session_duration | pct_click | total_load_time | pct_scroll_move | screen_height | page_activity_duration | ... | name | job | sex | residence | address | birthdate | username | current_location | race | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2579057139158 | 11.76366 | 13.33927 | 11.61679 | 241.51168 | 14.64515 | 6.16807 | 47.62969 | 600 | 14.73176 | ... | Bárbara Duarte | NaN | F | NaN | Largo de da Rosa, 634\nHorto\n37449-441 Barbos... | NaN | pedro-henriquefarias@bol.com.br | duartepedro | (Decimal('-64.822219'), Decimal('108.506880')) | black |
| 1 | 4220331404902 | 11.06654 | 13.35117 | 11.52874 | 290.70564 | 13.32630 | 1.81609 | 44.98595 | 768 | 14.63178 | ... | Bárbara Duarte | NaN | F | NaN | Largo de da Rosa, 634\nHorto\n37449-441 Barbos... | NaN | pedro-henriquefarias@bol.com.br | duartepedro | (Decimal('-64.822219'), Decimal('108.506880')) | black |
| 2 | 1862033634091 | 14.40034 | 16.76548 | 12.14671 | 150.61361 | 11.92945 | 4.41707 | 54.79054 | 768 | 10.94045 | ... | Bárbara Duarte | NaN | F | NaN | Largo de da Rosa, 634\nHorto\n37449-441 Barbos... | NaN | pedro-henriquefarias@bol.com.br | duartepedro | (Decimal('-64.822219'), Decimal('108.506880')) | black |
| 3 | 9194511663098 | 9.48744 | 9.73660 | 11.59939 | 167.87371 | 9.26560 | 0.78185 | 49.04837 | 600 | 12.96073 | ... | Bárbara Duarte | NaN | F | NaN | Largo de da Rosa, 634\nHorto\n37449-441 Barbos... | NaN | pedro-henriquefarias@bol.com.br | duartepedro | (Decimal('-64.822219'), Decimal('108.506880')) | black |
| 4 | 924520065762 | 10.57724 | 13.65790 | 12.29138 | 120.97370 | 11.30114 | 2.49417 | 33.40232 | 768 | 11.78523 | ... | Bárbara Duarte | NaN | F | NaN | Largo de da Rosa, 634\nHorto\n37449-441 Barbos... | NaN | pedro-henriquefarias@bol.com.br | duartepedro | (Decimal('-64.822219'), Decimal('108.506880')) | black |
5 rows × 35 columns
Vyhodenie nepotrebných stĺpcov¶
Najprv vyhodíme nepotrebné alebo veľmi ťažko využiteľné stĺpce z dataframe-u, ako napríklad "name", "residence", "address", "current_location", "job",... Vyhodíme ich preto, lebo sú to stringy a ťažko nájsť niečo, čo by nám tie stringy reprezentovalo.
def drop_useless_cols(data):
return data.drop(["name", "job", "residence", "address", "username", "current_location", "screen_width", "screen_height"], axis=1)
Teraz vyhodíme stĺpce so zanedbateľnými koreláciami (cor<0.01) voči "ack" stĺpcu, ktoré boli vyzualizované v heatmape v prvej fáze.
def drop_low_cor_cols(data):
data = drop_useless_cols(data)
data = data.drop(["product_ean", "session_duration", "pct_scroll_move_duration", "total_load_time", "pct_scroll_move", "page_activity_duration", "pct_click_product_info"], axis=1)
data = data.drop(["pct_scrandom", "pct_mouse_move", "session_id", "pct_rage_click", "user_id", "pct_wild_mouse"], axis=1)
return data
df = drop_low_cor_cols(df)
df.head()
| scroll_move_total_rel_distance | wild_mouse_duration | pct_click | pct_input | pct_mouse_click | ack | pct_doubleclick | mouse_move_total_rel_distance | browser_name | session_start | sex | birthdate | race | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11.76366 | 13.33927 | 14.64515 | 0.24442 | 8.40200 | 1.0 | 8.03629 | 12.18790 | safari | 2019-11-25 19:58:57 | F | NaN | pedro-henriquefarias@bol.com.br | black |
| 1 | 11.06654 | 13.35117 | 13.32630 | 0.24731 | 9.16945 | 1.0 | 9.62225 | 12.85323 | chrome | 2022-06-30 13:36:36 | F | NaN | pedro-henriquefarias@bol.com.br | black |
| 2 | 14.40034 | 16.76548 | 11.92945 | 7.53010 | 12.59008 | 0.0 | 6.60933 | 14.34023 | edge | 2020-08-05 18:12:38 | F | NaN | pedro-henriquefarias@bol.com.br | black |
| 3 | 9.48744 | 9.73660 | 9.26560 | 0.00660 | 9.43816 | 1.0 | 14.85800 | 11.86188 | chrome | 2022-12-20 10:29:17 | F | NaN | pedro-henriquefarias@bol.com.br | black |
| 4 | 10.57724 | 13.65790 | 11.30114 | 0.33680 | 14.44700 | 0.0 | 15.71613 | 7.77553 | safari | 2020-04-30 07:06:58 | F | NaN | pedro-henriquefarias@bol.com.br | black |
Čistenie dát¶
Chýbajúce hodnoty a ich doplnenie¶
df.isnull().sum()
scroll_move_total_rel_distance 11 wild_mouse_duration 14 pct_click 9 pct_input 13 pct_mouse_click 11 ack 0 pct_doubleclick 12 mouse_move_total_rel_distance 10 browser_name 0 session_start 0 sex 5549 birthdate 4942 mail 0 race 7786 dtype: int64
Dopĺňanie dát pomocou mean:
def fillout_all_with_mean(data):
mean = data["scroll_move_total_rel_distance"].mean()
data["scroll_move_total_rel_distance"] = data["scroll_move_total_rel_distance"].fillna(mean)
mean = data["pct_click"].mean()
data["pct_click"] = data["pct_click"].fillna(mean)
mean = data["pct_input"].mean()
data["pct_input"] = data["pct_input"].fillna(mean)
mean = data["wild_mouse_duration"].mean()
data["wild_mouse_duration"] = data["wild_mouse_duration"].fillna(mean)
mean = data["pct_doubleclick"].mean()
data["pct_doubleclick"] = data["pct_doubleclick"].fillna(mean)
mean = data["mouse_move_total_rel_distance"].mean()
data["mouse_move_total_rel_distance"] = data["mouse_move_total_rel_distance"].fillna(mean)
mean = data["pct_mouse_click"].mean()
data["pct_mouse_click"] = data["pct_mouse_click"].fillna(mean)
return data
def fillout_with_mean(data):
mean = data["scroll_move_total_rel_distance"].mean()
data["scroll_move_total_rel_distance"] = data["scroll_move_total_rel_distance"].fillna(mean)
mean = data["pct_click"].mean()
data["pct_click"] = data["pct_click"].fillna(mean)
mean = data["pct_input"].mean()
data["pct_input"] = data["pct_input"].fillna(mean)
return data
filled_out_means = fillout_with_mean(df)
Dopĺňanie dát pomocou median:
def fillout_with_median(data):
median = data["wild_mouse_duration"].median()
data["wild_mouse_duration"] = data["wild_mouse_duration"].fillna(median)
median = data["pct_doubleclick"].median()
data["pct_doubleclick"] = data["pct_doubleclick"].fillna(median)
median = data["mouse_move_total_rel_distance"].median()
data["mouse_move_total_rel_distance"] = data["mouse_move_total_rel_distance"].fillna(median)
return data
filled_out_median = fillout_with_median(df)
Dopĺňanie dát pomocou KNN:
def fillout_all_with_knn(data):
imputer = KNNImputer()
imputed_data = pd.DataFrame(imputer.fit_transform(data[["scroll_move_total_rel_distance"]]))
data["scroll_move_total_rel_distance"]=imputed_data[0].values
imputed_data = pd.DataFrame(imputer.fit_transform(data[["pct_click"]]))
data["pct_click"]=imputed_data[0].values
imputed_data = pd.DataFrame(imputer.fit_transform(data[["pct_input"]]))
data["pct_input"]=imputed_data[0].values
imputed_data = pd.DataFrame(imputer.fit_transform(data[["wild_mouse_duration"]]))
data["wild_mouse_duration"]=imputed_data[0].values
imputed_data = pd.DataFrame(imputer.fit_transform(data[["pct_doubleclick"]]))
data["pct_doubleclick"]=imputed_data[0].values
imputed_data = pd.DataFrame(imputer.fit_transform(data[["mouse_move_total_rel_distance"]]))
data["mouse_move_total_rel_distance"]=imputed_data[0].values
imputed_data = pd.DataFrame(imputer.fit_transform(data[["pct_mouse_click"]]))
data["pct_mouse_click"]=imputed_data[0].values
return data
def fillout_with_knn(data):
imputer = KNNImputer()
imputed_data = pd.DataFrame(imputer.fit_transform(data[["pct_mouse_click"]]))
data["pct_mouse_click"]=imputed_data[0].values
return data
filled_out_knn=fillout_with_knn(df)
Pipeline pre doplnenie dát:
df = fillout_with_mean(df)
df = fillout_with_median(df)
df = fillout_with_knn(df)
df.head()
| scroll_move_total_rel_distance | wild_mouse_duration | pct_click | pct_input | pct_mouse_click | ack | pct_doubleclick | mouse_move_total_rel_distance | browser_name | session_start | sex | birthdate | race | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11.76366 | 13.33927 | 14.64515 | 0.24442 | 8.40200 | 1.0 | 8.03629 | 12.18790 | safari | 2019-11-25 19:58:57 | F | NaN | pedro-henriquefarias@bol.com.br | black |
| 1 | 11.06654 | 13.35117 | 13.32630 | 0.24731 | 9.16945 | 1.0 | 9.62225 | 12.85323 | chrome | 2022-06-30 13:36:36 | F | NaN | pedro-henriquefarias@bol.com.br | black |
| 2 | 14.40034 | 16.76548 | 11.92945 | 7.53010 | 12.59008 | 0.0 | 6.60933 | 14.34023 | edge | 2020-08-05 18:12:38 | F | NaN | pedro-henriquefarias@bol.com.br | black |
| 3 | 9.48744 | 9.73660 | 9.26560 | 0.00660 | 9.43816 | 1.0 | 14.85800 | 11.86188 | chrome | 2022-12-20 10:29:17 | F | NaN | pedro-henriquefarias@bol.com.br | black |
| 4 | 10.57724 | 13.65790 | 11.30114 | 0.33680 | 14.44700 | 0.0 | 15.71613 | 7.77553 | safari | 2020-04-30 07:06:58 | F | NaN | pedro-henriquefarias@bol.com.br | black |
df.isnull().sum()
scroll_move_total_rel_distance 0 wild_mouse_duration 0 pct_click 0 pct_input 0 pct_mouse_click 0 ack 0 pct_doubleclick 0 mouse_move_total_rel_distance 0 browser_name 0 session_start 0 sex 5549 birthdate 4942 mail 0 race 7786 dtype: int64
Vyhodenie duplicitných záznamov¶
dups = df[df.duplicated()]
df = df.drop_duplicates()
print("Počet záznamov v dataframe:", len(df))
print("Počet vymazaných duplicitných záznamov v dataframe:", len(dups))
Počet záznamov v dataframe: 10927 Počet vymazaných duplicitných záznamov v dataframe: 121
Nahradenie vychýlených hodnôt¶
Zobrazíme grafy jednotlivých stĺpcov a prezrieme outliers.
fig = plt.figure(figsize = (15,20))
ax = fig.gca()
cols = ["scroll_move_total_rel_distance", "wild_mouse_duration", "pct_click", "pct_mouse_click", "pct_input",
"pct_doubleclick", "mouse_move_total_rel_distance"]
df[cols].hist(ax = ax)
C:\Users\peter\AppData\Local\Temp\ipykernel_34140\1366790592.py:5: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared. df[cols].hist(ax = ax)
array([[<Axes: title={'center': 'scroll_move_total_rel_distance'}>,
<Axes: title={'center': 'wild_mouse_duration'}>,
<Axes: title={'center': 'pct_click'}>],
[<Axes: title={'center': 'pct_mouse_click'}>,
<Axes: title={'center': 'pct_input'}>,
<Axes: title={'center': 'pct_doubleclick'}>],
[<Axes: title={'center': 'mouse_move_total_rel_distance'}>,
<Axes: >, <Axes: >]], dtype=object)
Po prezretí vychýlených hodnôt v grafoch v určitých stĺpcoch sa potom použije funkcia a ňou priradíme do kvantilov vychýlené hodnoty.
def identify_and_replace_outliers(data, column_name):
q5, q95 = data[column_name].quantile(0.05), data[column_name].quantile(0.95)
iqr = q95 - q5
# calculate the outlier cutoff
cut_off = iqr * 1.5
lower, upper = q5 - cut_off, q95 + cut_off
# identify outliers
outliers = [x for x in data[column_name] if x < lower or x > upper]
# remove outliers
return data.loc[(data[column_name] >= lower) & (data[column_name] <= upper)]
for col_name in cols:
df = identify_and_replace_outliers(df, col_name)
Po transformácií sa zobrazia znova grafy pre vizualizáciu výsledku:
fig = plt.figure(figsize = (15,20))
ax = fig.gca()
df[cols].hist(ax = ax)
C:\Users\peter\AppData\Local\Temp\ipykernel_34140\1632170498.py:3: UserWarning: To output multiple subplots, the figure containing the passed axes is being cleared. df[cols].hist(ax = ax)
array([[<Axes: title={'center': 'scroll_move_total_rel_distance'}>,
<Axes: title={'center': 'wild_mouse_duration'}>,
<Axes: title={'center': 'pct_click'}>],
[<Axes: title={'center': 'pct_mouse_click'}>,
<Axes: title={'center': 'pct_input'}>,
<Axes: title={'center': 'pct_doubleclick'}>],
[<Axes: title={'center': 'mouse_move_total_rel_distance'}>,
<Axes: >, <Axes: >]], dtype=object)
Realizácia predspracovania dát¶
Hlib Kokin
Rozdelenie na tréningové a testovacie množiny¶
Vytvorenie x (pre hodnoty) a y (pre značky)¶
x = df.drop("ack", axis=1)
x.info()
<class 'pandas.core.frame.DataFrame'> Index: 10785 entries, 0 to 11047 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 scroll_move_total_rel_distance 10785 non-null float64 1 wild_mouse_duration 10785 non-null float64 2 pct_click 10785 non-null float64 3 pct_input 10785 non-null float64 4 pct_mouse_click 10785 non-null float64 5 pct_doubleclick 10785 non-null float64 6 mouse_move_total_rel_distance 10785 non-null float64 7 browser_name 10785 non-null object 8 session_start 10785 non-null object 9 sex 5369 non-null object 10 birthdate 5969 non-null object 11 mail 10785 non-null object 12 race 3185 non-null object dtypes: float64(7), object(6) memory usage: 1.2+ MB
y = df.ack
y.info()
<class 'pandas.core.series.Series'> Index: 10785 entries, 0 to 11047 Series name: ack Non-Null Count Dtype -------------- ----- 10785 non-null float64 dtypes: float64(1) memory usage: 168.5 KB
Splitting¶
Split it 81.14% to 18.86% which gives us training set of 8750, perfectly suitable for batch size divisable by 25
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8114)
print(x_train.shape)
print(x_test.shape)
(8750, 13) (2035, 13)
Transformovanie dát¶
Predspracovanie atribútu "session_start"¶
Transformujeme session_start na binárnu premennú is_weekend (nová funkcia)
x_train.info()
<class 'pandas.core.frame.DataFrame'> Index: 8750 entries, 3617 to 2181 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 scroll_move_total_rel_distance 8750 non-null float64 1 wild_mouse_duration 8750 non-null float64 2 pct_click 8750 non-null float64 3 pct_input 8750 non-null float64 4 pct_mouse_click 8750 non-null float64 5 pct_doubleclick 8750 non-null float64 6 mouse_move_total_rel_distance 8750 non-null float64 7 browser_name 8750 non-null object 8 session_start 8750 non-null object 9 sex 4385 non-null object 10 birthdate 4848 non-null object 11 mail 8750 non-null object 12 race 2580 non-null object dtypes: float64(7), object(6) memory usage: 957.0+ KB
x_train.head()
| scroll_move_total_rel_distance | wild_mouse_duration | pct_click | pct_input | pct_mouse_click | pct_doubleclick | mouse_move_total_rel_distance | browser_name | session_start | sex | birthdate | race | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3617 | 15.37356 | 12.64546 | 11.70060 | 0.12209 | 13.18518 | 8.90191 | 13.12975 | chrome | 2021-04-20 10:05:59 | M | 1930-09-28 | isaacda-cruz@bol.com.br | NaN |
| 4552 | 10.17630 | 14.56998 | 16.07030 | 0.83816 | 8.78840 | 7.69944 | 12.56797 | edge | 2021-08-09 04:34:58 | F | 1993-01-12 | pito@hotmail.com | NaN |
| 7333 | 14.93004 | 12.39987 | 11.33534 | 0.09540 | 14.69664 | 9.21089 | 13.01998 | mobile | 2020-12-06 18:10:29 | NaN | 1994-12-04 | gibsondarren@gmail.com | NaN |
| 2127 | 9.93342 | 13.23541 | 12.76841 | 0.22075 | 9.76927 | 10.87443 | 13.57756 | edge | 2021-03-12 15:01:51 | M | NaN | hjohnson@hotmail.com | NaN |
| 4591 | 7.60152 | 15.75110 | 14.07854 | 2.73049 | 11.54801 | 10.62808 | 14.54479 | edge | 2022-12-03 04:49:46 | NaN | 1908-11-10 | maysa56@gmail.com | NaN |
def preprocess_session_start(x_train) :
x_train['session_start'] = pd.to_datetime(x_train['session_start'])
x_train['is_weekend'] = x_train['session_start'].dt.weekday // 5 # 0 for Monday-Friday (workdays), 1 for Saturday-Sunday (weekends)
x_train = x_train.drop('session_start', axis=1)
return x_train
x_train = preprocess_session_start(x_train)
x_test = preprocess_session_start(x_test)
x_train.info()
<class 'pandas.core.frame.DataFrame'> Index: 8750 entries, 3617 to 2181 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 scroll_move_total_rel_distance 8750 non-null float64 1 wild_mouse_duration 8750 non-null float64 2 pct_click 8750 non-null float64 3 pct_input 8750 non-null float64 4 pct_mouse_click 8750 non-null float64 5 pct_doubleclick 8750 non-null float64 6 mouse_move_total_rel_distance 8750 non-null float64 7 browser_name 8750 non-null object 8 sex 4385 non-null object 9 birthdate 4848 non-null object 10 mail 8750 non-null object 11 race 2580 non-null object 12 is_weekend 8750 non-null int32 dtypes: float64(7), int32(1), object(5) memory usage: 922.9+ KB
Predspracovanie atribútu "sex"¶
x_train["sex"].describe()
count 4385 unique 2 top F freq 2240 Name: sex, dtype: object
x_train["sex"].unique()
array(['M', 'F', nan], dtype=object)
def encode_with_onehot_encoder(x_train, column_to_encode):
enc = OneHotEncoder()
x_train[column_to_encode] = enc.fit_transform(x_train[column_to_encode].values.reshape(-1,1)).toarray()
return x_train
def preprocess_sex(x_train):
x_train = encode_with_onehot_encoder(x_train, "sex")
return x_train
x_train = preprocess_sex(x_train)
x_test = preprocess_sex(x_test)
x_train['sex'].describe()
count 8750.000000 mean 0.256000 std 0.436447 min 0.000000 25% 0.000000 50% 0.000000 75% 1.000000 max 1.000000 Name: sex, dtype: float64
Predspracovanie atribútu "race"¶
Dataset ma špecifikované 4 rasy, jedna hodnota je unspecified a posledná je nan. Tieto hodnoty si namapujeme na 0 až 5.
x_train["race"].describe()
count 2580 unique 5 top unspecified freq 832 Name: race, dtype: object
x_train["race"].unique()
array([nan, 'unspecified', 'asian', 'indian', 'white', 'black'],
dtype=object)
def encode_with_label_encoder(x_train, column_to_encode):
enc = LabelEncoder()
x_train[column_to_encode] = enc.fit_transform(x_train[column_to_encode])
print(dict(zip(enc.classes_, enc.transform(enc.classes_))))
return x_train
def preprocess_race(x_train):
x_train = encode_with_label_encoder(x_train, "race")
return x_train
x_train = preprocess_race(x_train)
x_test = preprocess_race(x_test)
{'asian': 0, 'black': 1, 'indian': 2, 'unspecified': 3, 'white': 4, nan: 5}
{'asian': 0, 'black': 1, 'indian': 2, 'unspecified': 3, 'white': 4, nan: 5}
Predspracovanie atribútu "birthdate"¶
Dataset ma špecifikovaný stĺpec "birthdate", kde treba extrahovať vek usera a nan konvertovať na číslo (-1 bude reprezentovať nan, kvázi undefined).
x_train["birthdate"].describe()
count 4848 unique 1306 top 2004-11-02 freq 15 Name: birthdate, dtype: object
def get_age(birthdate):
if pd.isna(birthdate):
return -1
birthdate = datetime.datetime.strptime(birthdate, "%Y-%m-%d").date()
today = datetime.date.today()
return today.year - birthdate.year - ((today.month,today.day) < (birthdate.month,birthdate.day))
def preprocess_birthdate(x_train):
x_train["birthdate"] = x_train["birthdate"].apply(get_age)
return x_train
x_train = preprocess_birthdate(x_train)
x_test = preprocess_birthdate(x_test)
x_train.birthdate
3617 93
4552 30
7333 29
2127 -1
4591 115
...
6538 -1
6566 -1
4033 -1
10159 22
2181 -1
Name: birthdate, Length: 8750, dtype: int64
Predspracovanie atribútu "email"¶
Dataset ma špecifikovaný stĺpec "email", kde sú uložené emailové adresy používateľov. Z týchto emailových adries sa dajú extrahovať najzaužívanejšie domény a reprezentovať ich číslami.
x_train["mail"].head()
3617 isaacda-cruz@bol.com.br 4552 pito@hotmail.com 7333 gibsondarren@gmail.com 2127 hjohnson@hotmail.com 4591 maysa56@gmail.com Name: mail, dtype: object
x_train["mail"].describe()
count 8750 unique 2411 top ryosuke30@yahoo.com freq 12 Name: mail, dtype: object
def preprocess_email(x_train):
x_train["mail"] = x_train["mail"].str.split("@").str[1].str.split(".").str[0]
print(x_train["mail"].unique())
x_train["mail"] = pd.factorize(x_train["mail"])[0]
print(x_train['mail'].unique())
return x_train
x_train = preprocess_email(x_train)
x_test = preprocess_email(x_test)
['bol' 'hotmail' 'gmail' 'tin' 'seznam' 'chello' 'libero' 'aol' 'centrum' 'googlemail' 'live' 'yahoo' 'ig' 'poste' 'virgilio' 'post' 'gmx' 'volny' 'vodafone' 'tim' 'outlook' 'uol' 'email' 'fastwebnet' 'tiscali' 'web' 'alice' 'tele2'] [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27] ['post' 'live' 'gmx' 'gmail' 'aol' 'uol' 'yahoo' 'googlemail' 'libero' 'hotmail' 'alice' 'centrum' 'seznam' 'bol' 'chello' 'web' 'ig' 'fastwebnet' 'outlook' 'tin' 'volny' 'tim' 'email' 'tiscali' 'vodafone' 'tele2' 'virgilio' 'poste'] [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27]
Predspracovanie atribútu "browser_name"¶
Dataset ma špecifikovaný stĺpec "browser_name", kde sú uložené používateľom využívané prehliadavače. Hodnoty sú uložené v stringoch a len si ich namapujeme na čísla.
x_train["browser_name"].head()
3617 chrome 4552 edge 7333 mobile 2127 edge 4591 edge Name: browser_name, dtype: object
x_train["browser_name"].describe()
count 8750 unique 7 top chrome freq 2656 Name: browser_name, dtype: object
x_train["browser_name"].unique()
array(['chrome', 'edge', 'mobile', 'firefox', 'opera', 'safari', 'other'],
dtype=object)
def preprocess_browser_name(x_train):
x_train = encode_with_label_encoder(x_train, "browser_name")
return x_train
x_test = preprocess_browser_name(x_test)
x_train = preprocess_browser_name(x_train)
{'chrome': 0, 'edge': 1, 'firefox': 2, 'mobile': 3, 'opera': 4, 'other': 5, 'safari': 6}
{'chrome': 0, 'edge': 1, 'firefox': 2, 'mobile': 3, 'opera': 4, 'other': 5, 'safari': 6}
Zdôvodnenie¶
OneHotEncoder¶
Tento kódovač bol použitý pre stĺpec s hodnotami pohlavia používateľa, pretože sám nahrádza hodnoty NaN (nevyvoláva chybu) a je špecializovaný na binárne stĺpce, ako je tento
LabelEncoder¶
Tento Encoder bol použitý na dva stĺpce: race a browser_name, pretože majú viac ako 2 kategórie, ale zároveň každá z kategórií mala rovnakú váhu, takže tu nebola použitá výhoda LabelEncoder na kódovanie podľa váhy
Kódovanie dátumov¶
Pre stĺpce session_start a birthdate sme použili kódovanie dátumu, pričom v stĺpci session_start sme vybrali dni výždni (od 0 do 6, kde 5 a 6 sú víkendy) a potom sme určili, či ide o víkend, delením so zaokrúhlením poradového čísla dňa (napr. 4 // 5 = 0, 5 // 5 = 1). Pre dátum narodenia sme vzali dátum, ktorý tam bol, vzali sme dnešný dátum a od dnešného dátumu sme odpočítali aktuálny dátum, aby sme dostali roky.
Výber atribútov pre strojové učenie¶
Peter Bartoš
def preprocess_columns(data):
data = preprocess_browser_name(data)
data = preprocess_email(data)
data = preprocess_birthdate(data)
data = preprocess_race(data)
data = preprocess_sex(data)
data = preprocess_session_start(data)
return data
V datasete sa množstvo pomocných atribútov k atribútu ack. Ack predstavuje, že či koniec koncov používateľ niečo na stránke kúpil a ostatné nám môžu povedať, že čo ho k tomu viedlo. Po vykreslení heatmapy uvidíme, že ktoré atribúty korelujú viac, ktoré menej a podľa toho vyberieme najinformatívnejšie atribúty.
selection = df.copy()
selection = preprocess_browser_name(selection)
selection = preprocess_email(selection)
selection = preprocess_birthdate(selection)
selection = preprocess_race(selection)
selection = preprocess_sex(selection)
selection = preprocess_session_start(selection)
figure, ax = plt.subplots(figsize=(20,16))
sns.heatmap(selection.corr(),ax=ax, annot =True, fmt = ".3f")
{'chrome': 0, 'edge': 1, 'firefox': 2, 'mobile': 3, 'opera': 4, 'other': 5, 'safari': 6}
['bol' 'fastwebnet' 'ig' 'yahoo' 'googlemail' 'hotmail' 'tiscali' 'live'
'gmail' 'outlook' 'libero' 'email' 'virgilio' 'aol' 'tin' 'gmx' 'centrum'
'volny' 'web' 'seznam' 'vodafone' 'post' 'alice' 'tim' 'uol' 'chello'
'tele2' 'poste']
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25 26 27]
{'asian': 0, 'black': 1, 'indian': 2, 'unspecified': 3, 'white': 4, nan: 5}
<Axes: >
Vyhodíme stĺpce z datasetu, ktoré nemajú nejakú extra korelačnú hodnotu (browser_name, sex, birthdate,..) a znova vykreslíme pomocou heatmapy korelácie.
selection = selection.drop(["race", "is_weekend", "sex", "birthdate", "mail", "browser_name"], axis=1)
figure, ax = plt.subplots(figsize=(20,16))
sns.heatmap(selection.corr(),ax=ax, annot =True, fmt = ".3f")
<Axes: >
Korelácie k atribútu ack sú celkom výrazné, hlavne pri "pct_mouse_click" stĺpci. Najbližší ďalší je "mouse_move_total_rel_distance", ktorý má už značne menšiu koreláciu.
cor_list = abs(selection.corr()["ack"])
cor_list.sort_values(ascending=False)
ack 1.000000 pct_mouse_click 0.697991 mouse_move_total_rel_distance 0.409943 pct_doubleclick 0.394063 pct_click 0.387671 scroll_move_total_rel_distance 0.382251 wild_mouse_duration 0.368472 pct_input 0.214083 Name: ack, dtype: float64
Uistíme sa, že všetky stĺpce sú numerické a následne si vypíšeme aj ich typy.
selection = selection.apply(pd.to_numeric, errors='coerce')
selection.dtypes
selection.columns
Index(['scroll_move_total_rel_distance', 'wild_mouse_duration', 'pct_click',
'pct_input', 'pct_mouse_click', 'ack', 'pct_doubleclick',
'mouse_move_total_rel_distance'],
dtype='object')
Použijeme recursive feature elimination (RFE) ako metódu na feature selection.
estimator = SVR(kernel="linear")
selector = RFE(estimator, n_features_to_select=7, step=1)
selector = selector.fit(selection.drop("ack", axis=1), selection["ack"])
print(selector.support_)
print(selector.ranking_)
[ True True True True True True True] [1 1 1 1 1 1 1]
Následný graf zobrazí informatívnosť jednotlivých atribútov voči stĺpcu "ack". Graf vykreslil, že jediný atribút s rozumnou koreláciou je "pct_input".
selector = SelectKBest(f_regression, k=7)
X_selected = selector.fit_transform(selection.drop("ack", axis=1), selection["ack"])
selected_feature_names = selection.drop("ack", axis=1).columns[selector.get_support()]
pvalues = -np.log10(selector.pvalues_)
print(pvalues)
plt.figure(figsize=(10, 10))
plt.bar(range(len(selected_feature_names)), pvalues)
plt.xticks(range(len(selected_feature_names)), selected_feature_names, rotation='vertical')
plt.xlabel('Selected Features')
plt.ylabel('p-values')
plt.title('Feature Selection Results')
plt.show()
[ inf inf inf 111.29744687 inf
inf inf]
C:\Users\peter\AppData\Local\Temp\ipykernel_34140\3329772554.py:4: RuntimeWarning: divide by zero encountered in log10 pvalues = -np.log10(selector.pvalues_)
Graf nezobrazil nejaké extra dobré výsledky a kvôli zlým p-hodnotám nevedel vykresliť ostatné atribúty. Jednotlivé atribúty aj tak vyberieme pre ďalšiu fázu.
Replikovateľnosť predspracovania¶
Hlib Kokin
Definujme tranformátory¶
class CustomTransformer(BaseEstimator, TransformerMixin):
def __init__(self, cols):
self.cols = cols
def fit(self, X, y = None):
return X
def transform(self, df, y = None):
# odstranenie nepotrebnych slpcov
df = drop_low_cor_cols(df)
# riesenie null hodnot
df = fillout_with_mean(df)
df = fillout_with_median(df)
df = fillout_with_knn(df)
# odstranenie duplikatov
df = df.drop_duplicates()
# riesenie outlierov
for col in self.cols:
df = identify_and_replace_outliers(df, col_name)
# riesenie categorickych hodnot s typom object
df = preprocess_session_start(df)
df = preprocess_sex(df)
df = preprocess_race(df)
df = preprocess_birthdate(df)
df = preprocess_email(df)
df = preprocess_browser_name(df)
return df
Create test dataframe for pipeline¶
test = pd.merge(sessions, users, on=["user_id", "user_id"])
test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11048 entries, 0 to 11047 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 product_ean 11048 non-null int64 1 scroll_move_total_rel_distance 11037 non-null float64 2 wild_mouse_duration 11034 non-null float64 3 pct_scroll_move_duration 11040 non-null float64 4 session_duration 11048 non-null float64 5 pct_click 11039 non-null float64 6 total_load_time 11048 non-null float64 7 pct_scroll_move 11035 non-null float64 8 screen_height 11048 non-null object 9 page_activity_duration 11036 non-null float64 10 screen_width 11048 non-null object 11 pct_wild_mouse 11038 non-null float64 12 pct_input 11035 non-null float64 13 pct_mouse_click 11037 non-null float64 14 pct_scrandom 11040 non-null float64 15 ack 11048 non-null float64 16 pct_click_product_info 11037 non-null float64 17 pct_doubleclick 11036 non-null float64 18 pct_mouse_move 11041 non-null float64 19 mouse_move_total_rel_distance 11038 non-null float64 20 session_id 11048 non-null int64 21 pct_rage_click 11040 non-null float64 22 user_id 11048 non-null int64 23 browser_name 11048 non-null object 24 session_start 11048 non-null object 25 name 11048 non-null object 26 job 4442 non-null object 27 sex 5499 non-null object 28 residence 3875 non-null object 29 address 9396 non-null object 30 birthdate 6106 non-null object 31 mail 11048 non-null object 32 username 11048 non-null object 33 current_location 10487 non-null object 34 race 3262 non-null object dtypes: float64(18), int64(3), object(14) memory usage: 3.0+ MB
test_x = test.drop("ack", axis=1)
print(test_x.info())
test_y = test['ack']
print(test_y.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11048 entries, 0 to 11047 Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 product_ean 11048 non-null int64 1 scroll_move_total_rel_distance 11037 non-null float64 2 wild_mouse_duration 11034 non-null float64 3 pct_scroll_move_duration 11040 non-null float64 4 session_duration 11048 non-null float64 5 pct_click 11039 non-null float64 6 total_load_time 11048 non-null float64 7 pct_scroll_move 11035 non-null float64 8 screen_height 11048 non-null object 9 page_activity_duration 11036 non-null float64 10 screen_width 11048 non-null object 11 pct_wild_mouse 11038 non-null float64 12 pct_input 11035 non-null float64 13 pct_mouse_click 11037 non-null float64 14 pct_scrandom 11040 non-null float64 15 pct_click_product_info 11037 non-null float64 16 pct_doubleclick 11036 non-null float64 17 pct_mouse_move 11041 non-null float64 18 mouse_move_total_rel_distance 11038 non-null float64 19 session_id 11048 non-null int64 20 pct_rage_click 11040 non-null float64 21 user_id 11048 non-null int64 22 browser_name 11048 non-null object 23 session_start 11048 non-null object 24 name 11048 non-null object 25 job 4442 non-null object 26 sex 5499 non-null object 27 residence 3875 non-null object 28 address 9396 non-null object 29 birthdate 6106 non-null object 30 mail 11048 non-null object 31 username 11048 non-null object 32 current_location 10487 non-null object 33 race 3262 non-null object dtypes: float64(17), int64(3), object(14) memory usage: 2.9+ MB None <class 'pandas.core.series.Series'> RangeIndex: 11048 entries, 0 to 11047 Series name: ack Non-Null Count Dtype -------------- ----- 11048 non-null float64 dtypes: float64(1) memory usage: 86.4 KB None
Definicia pipelajnu¶
# numericke stlpce, ktore treba osetrit na outliery
cols = ["scroll_move_total_rel_distance", "wild_mouse_duration", "pct_click", "pct_mouse_click", "pct_input",
"pct_doubleclick", "mouse_move_total_rel_distance"]
pipeline = Pipeline(steps = [
('preprocessor', CustomTransformer(cols))
])
Testovanie pipelajny¶
Teraz má pipelajna iba transformátor, ale neskôr tam pridáme aj tréningový model
pipeline.fit(test_x, test_y)
x = pipeline.transform(test_x)
x.info()
{'asian': 0, 'black': 1, 'indian': 2, 'unspecified': 3, 'white': 4, nan: 5}
['bol' 'fastwebnet' 'ig' 'yahoo' 'googlemail' 'hotmail' 'tiscali' 'live'
'gmail' 'outlook' 'libero' 'email' 'virgilio' 'aol' 'tin' 'gmx' 'centrum'
'volny' 'web' 'seznam' 'vodafone' 'post' 'alice' 'tim' 'uol' 'chello'
'tele2' 'poste']
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25 26 27]
{'chrome': 0, 'edge': 1, 'firefox': 2, 'mobile': 3, 'opera': 4, 'other': 5, 'safari': 6}
<class 'pandas.core.frame.DataFrame'>
Index: 10927 entries, 0 to 11047
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 scroll_move_total_rel_distance 10927 non-null float64
1 wild_mouse_duration 10927 non-null float64
2 pct_click 10927 non-null float64
3 pct_input 10927 non-null float64
4 pct_mouse_click 10927 non-null float64
5 pct_doubleclick 10927 non-null float64
6 mouse_move_total_rel_distance 10927 non-null float64
7 browser_name 10927 non-null int32
8 sex 10927 non-null float64
9 birthdate 10927 non-null int64
10 mail 10927 non-null int64
11 race 10927 non-null int32
12 is_weekend 10927 non-null int32
dtypes: float64(8), int32(3), int64(2)
memory usage: 1.0 MB
transformer = CustomTransformer(cols)
x = transformer.transform(test_x, test_y)
x.info()
{'asian': 0, 'black': 1, 'indian': 2, 'unspecified': 3, 'white': 4, nan: 5}
['bol' 'fastwebnet' 'ig' 'yahoo' 'googlemail' 'hotmail' 'tiscali' 'live'
'gmail' 'outlook' 'libero' 'email' 'virgilio' 'aol' 'tin' 'gmx' 'centrum'
'volny' 'web' 'seznam' 'vodafone' 'post' 'alice' 'tim' 'uol' 'chello'
'tele2' 'poste']
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25 26 27]
{'chrome': 0, 'edge': 1, 'firefox': 2, 'mobile': 3, 'opera': 4, 'other': 5, 'safari': 6}
<class 'pandas.core.frame.DataFrame'>
Index: 10927 entries, 0 to 11047
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 scroll_move_total_rel_distance 10927 non-null float64
1 wild_mouse_duration 10927 non-null float64
2 pct_click 10927 non-null float64
3 pct_input 10927 non-null float64
4 pct_mouse_click 10927 non-null float64
5 pct_doubleclick 10927 non-null float64
6 mouse_move_total_rel_distance 10927 non-null float64
7 browser_name 10927 non-null int32
8 sex 10927 non-null float64
9 birthdate 10927 non-null int64
10 mail 10927 non-null int64
11 race 10927 non-null int32
12 is_weekend 10927 non-null int32
dtypes: float64(8), int32(3), int64(2)
memory usage: 1.0 MB
test_x.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11048 entries, 0 to 11047 Data columns (total 34 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 product_ean 11048 non-null int64 1 scroll_move_total_rel_distance 11037 non-null float64 2 wild_mouse_duration 11034 non-null float64 3 pct_scroll_move_duration 11040 non-null float64 4 session_duration 11048 non-null float64 5 pct_click 11039 non-null float64 6 total_load_time 11048 non-null float64 7 pct_scroll_move 11035 non-null float64 8 screen_height 11048 non-null object 9 page_activity_duration 11036 non-null float64 10 screen_width 11048 non-null object 11 pct_wild_mouse 11038 non-null float64 12 pct_input 11035 non-null float64 13 pct_mouse_click 11037 non-null float64 14 pct_scrandom 11040 non-null float64 15 pct_click_product_info 11037 non-null float64 16 pct_doubleclick 11036 non-null float64 17 pct_mouse_move 11041 non-null float64 18 mouse_move_total_rel_distance 11038 non-null float64 19 session_id 11048 non-null int64 20 pct_rage_click 11040 non-null float64 21 user_id 11048 non-null int64 22 browser_name 11048 non-null object 23 session_start 11048 non-null object 24 name 11048 non-null object 25 job 4442 non-null object 26 sex 5499 non-null object 27 residence 3875 non-null object 28 address 9396 non-null object 29 birthdate 6106 non-null object 30 mail 11048 non-null object 31 username 11048 non-null object 32 current_location 10487 non-null object 33 race 3262 non-null object dtypes: float64(17), int64(3), object(14) memory usage: 2.9+ MB
Jednoduchý klasifikátor na základe závislosti v dátach¶
Hlib Kokin
OneR algorithm¶
Zmenime numericke stlpce na kategoricke¶
def get_age_group(age):
if age == -1:
return -1 # NA
elif age <= 20:
return 0 # 'young'
elif age <= 40:
return 1 # 'young adult'
elif age <= 60:
return 2 # 'mature'
else:
return 3 # 'elder'
group1 = [0, 1, 4, 6]
def get_category_for_numerical_1(value): # for columns 0, 1, 4, 6
if value == -1:
return -1 # 'NA'
if value <= 10:
return 0 # low
elif value <= 14:
return 1 # mid'
else:
return 2 # high
group2 = [2, 5]
def get_category_for_numerical_2(value): # for columns 2, 5
if value == -1:
return -1 # 'NA'
if value <= 8:
return 0 # low
elif value <= 13:
return 1 # mid'
else:
return 2 # high
group3 = [3]
def get_category_for_numerical_3(value): # for pct_scroll_move_duration
if value == -1:
return -1 # 'NA'
if value <= 0.5:
return 0 # low
elif value <= 1:
return 1 # mid'
else:
return 2 # high
x_train_OneR = x_train.copy()
x_train_OneR["birthdate"] = x_train_OneR["birthdate"].apply(get_age_group)
x_train_OneR.rename(columns={'birthdate': 'age_category'}, inplace=True)
plt.boxplot(x_train_OneR.iloc[:, 0:7])
{'whiskers': [<matplotlib.lines.Line2D at 0x1b880f0a320>,
<matplotlib.lines.Line2D at 0x1b880f0afe0>,
<matplotlib.lines.Line2D at 0x1b88020b820>,
<matplotlib.lines.Line2D at 0x1b881739900>,
<matplotlib.lines.Line2D at 0x1b880f78d90>,
<matplotlib.lines.Line2D at 0x1b880192a40>,
<matplotlib.lines.Line2D at 0x1b881a12770>,
<matplotlib.lines.Line2D at 0x1b881a110f0>,
<matplotlib.lines.Line2D at 0x1b881941330>,
<matplotlib.lines.Line2D at 0x1b880225f00>,
<matplotlib.lines.Line2D at 0x1b88961e500>,
<matplotlib.lines.Line2D at 0x1b888229240>,
<matplotlib.lines.Line2D at 0x1b883761b40>,
<matplotlib.lines.Line2D at 0x1b883762590>],
'caps': [<matplotlib.lines.Line2D at 0x1b8801d4340>,
<matplotlib.lines.Line2D at 0x1b880d3b7f0>,
<matplotlib.lines.Line2D at 0x1b880f9d4e0>,
<matplotlib.lines.Line2D at 0x1b881b3fee0>,
<matplotlib.lines.Line2D at 0x1b880263970>,
<matplotlib.lines.Line2D at 0x1b880260cd0>,
<matplotlib.lines.Line2D at 0x1b881a12350>,
<matplotlib.lines.Line2D at 0x1b881b7c940>,
<matplotlib.lines.Line2D at 0x1b880f43d60>,
<matplotlib.lines.Line2D at 0x1b887513eb0>,
<matplotlib.lines.Line2D at 0x1b888229300>,
<matplotlib.lines.Line2D at 0x1b88822a7a0>,
<matplotlib.lines.Line2D at 0x1b883763220>,
<matplotlib.lines.Line2D at 0x1b883763c70>],
'boxes': [<matplotlib.lines.Line2D at 0x1b8844160b0>,
<matplotlib.lines.Line2D at 0x1b8895c2470>,
<matplotlib.lines.Line2D at 0x1b880f20d00>,
<matplotlib.lines.Line2D at 0x1b881a11fc0>,
<matplotlib.lines.Line2D at 0x1b881943d30>,
<matplotlib.lines.Line2D at 0x1b88961fbb0>,
<matplotlib.lines.Line2D at 0x1b883761030>],
'medians': [<matplotlib.lines.Line2D at 0x1b8801af3a0>,
<matplotlib.lines.Line2D at 0x1b880f22620>,
<matplotlib.lines.Line2D at 0x1b8802606d0>,
<matplotlib.lines.Line2D at 0x1b8819415d0>,
<matplotlib.lines.Line2D at 0x1b88961e590>,
<matplotlib.lines.Line2D at 0x1b88822b430>,
<matplotlib.lines.Line2D at 0x1b883761d50>],
'fliers': [<matplotlib.lines.Line2D at 0x1b881762020>,
<matplotlib.lines.Line2D at 0x1b880f23c10>,
<matplotlib.lines.Line2D at 0x1b881a116c0>,
<matplotlib.lines.Line2D at 0x1b881943940>,
<matplotlib.lines.Line2D at 0x1b88961d7e0>,
<matplotlib.lines.Line2D at 0x1b888228160>,
<matplotlib.lines.Line2D at 0x1b88379cac0>],
'means': []}
for feature in x_train_OneR.iloc[:, group1]:
x_train_OneR[feature] = x_train_OneR[feature].apply(get_category_for_numerical_1)
for feature in x_train_OneR.iloc[:, group2]:
x_train_OneR[feature] = x_train_OneR[feature].apply(get_category_for_numerical_2)
for feature in x_train_OneR.iloc[:, group3]:
x_train_OneR[feature] = x_train_OneR[feature].apply(get_category_for_numerical_3)
plt.boxplot(x_train_OneR.iloc[:, 0:7])
{'whiskers': [<matplotlib.lines.Line2D at 0x1b8881d6710>,
<matplotlib.lines.Line2D at 0x1b8881d63b0>,
<matplotlib.lines.Line2D at 0x1b8881d5de0>,
<matplotlib.lines.Line2D at 0x1b8817e2b00>,
<matplotlib.lines.Line2D at 0x1b888112260>,
<matplotlib.lines.Line2D at 0x1b888112620>,
<matplotlib.lines.Line2D at 0x1b8894c9d80>,
<matplotlib.lines.Line2D at 0x1b8894cbe20>,
<matplotlib.lines.Line2D at 0x1b8894c8040>,
<matplotlib.lines.Line2D at 0x1b8894ca3b0>,
<matplotlib.lines.Line2D at 0x1b8894a3c10>,
<matplotlib.lines.Line2D at 0x1b889560ac0>,
<matplotlib.lines.Line2D at 0x1b8895622c0>,
<matplotlib.lines.Line2D at 0x1b889561630>],
'caps': [<matplotlib.lines.Line2D at 0x1b8881d6140>,
<matplotlib.lines.Line2D at 0x1b8881d5330>,
<matplotlib.lines.Line2D at 0x1b887ca1ea0>,
<matplotlib.lines.Line2D at 0x1b8876b6a10>,
<matplotlib.lines.Line2D at 0x1b888112650>,
<matplotlib.lines.Line2D at 0x1b8894c9570>,
<matplotlib.lines.Line2D at 0x1b8894cace0>,
<matplotlib.lines.Line2D at 0x1b8894c93f0>,
<matplotlib.lines.Line2D at 0x1b8894c9180>,
<matplotlib.lines.Line2D at 0x1b8894cb490>,
<matplotlib.lines.Line2D at 0x1b889562620>,
<matplotlib.lines.Line2D at 0x1b889561570>,
<matplotlib.lines.Line2D at 0x1b8895605b0>,
<matplotlib.lines.Line2D at 0x1b889560a00>],
'boxes': [<matplotlib.lines.Line2D at 0x1b8801f00d0>,
<matplotlib.lines.Line2D at 0x1b8881d52a0>,
<matplotlib.lines.Line2D at 0x1b887643100>,
<matplotlib.lines.Line2D at 0x1b8894cbcd0>,
<matplotlib.lines.Line2D at 0x1b8894ca080>,
<matplotlib.lines.Line2D at 0x1b8894a0d30>,
<matplotlib.lines.Line2D at 0x1b889561c90>],
'medians': [<matplotlib.lines.Line2D at 0x1b8881d5ed0>,
<matplotlib.lines.Line2D at 0x1b8876b68f0>,
<matplotlib.lines.Line2D at 0x1b8894cab90>,
<matplotlib.lines.Line2D at 0x1b8894ca2f0>,
<matplotlib.lines.Line2D at 0x1b8894c9210>,
<matplotlib.lines.Line2D at 0x1b889561150>,
<matplotlib.lines.Line2D at 0x1b8895628c0>],
'fliers': [<matplotlib.lines.Line2D at 0x1b8881d5030>,
<matplotlib.lines.Line2D at 0x1b8876b6fb0>,
<matplotlib.lines.Line2D at 0x1b8894cb2e0>,
<matplotlib.lines.Line2D at 0x1b8894cbf10>,
<matplotlib.lines.Line2D at 0x1b8894a3be0>,
<matplotlib.lines.Line2D at 0x1b889561ae0>,
<matplotlib.lines.Line2D at 0x1b889562c50>],
'means': []}
Implementacia klasickeho OneR¶
def train_OneR(features, labels):
frequencyTable = pd.DataFrame(columns=['feature', 'value', 'cnt_pos', 'cnt_neg'])
confusionMatrix = pd.DataFrame(columns=['feature', 'value', 'ack'])
metrics = pd.DataFrame(columns=['feature', 'acc', 'precision', 'recall'])
for feature in features.columns.tolist():
print("Trénujeme " + feature)
cnt_pos = 0
cnt_all = 0
# spravime zbierku frequency table roznych hodnot kazdeho feature
for uni in features[feature].unique():
examples = features[features[feature] == uni]
cnt_posit = len(examples[labels.loc[examples.index] == 1])
cnt_neg = len(examples[labels.loc[examples.index] == 0])
new_row = {
'feature': feature,
'value': uni,
'cnt_pos': cnt_posit, # pocitadlo nakupov pri prvej navsteve a tejto hodnote uni
'cnt_neg': cnt_neg, # analogicky ale pocitadlo pripadov kde nakup sa neuskytocnil
}
cnt_pos += cnt_posit
cnt_all += cnt_posit + cnt_neg
frequencyTable.loc[len(frequencyTable)] = new_row
# naplnime confusuion matrix rules pre aktualny feature, zoberieme najcastejsie predictor
for uni in features[feature].unique():
actual = frequencyTable[(frequencyTable['feature'] == feature) & (frequencyTable['value'] == uni)].iloc[0, :]
ack = -1
# ak frequency table ma v sebe viac pozitivnych vysledkov - pravidlo bude nasledovne:
# pri uni==0 ack vzdy == 1
if actual.cnt_pos >= actual.cnt_neg:
ack = 1
# a naopak
else:
ack = 0
new_row = {
'feature': feature,
'value': uni,
'ack': ack
}
confusionMatrix.loc[len(confusionMatrix)] = new_row
# potom overime percentualne nakolko nasa confusion matrix ma pravdu, cize spravime error calculation pre kazdy feature
cumulative_acc = 0
pocet_values = 0
truePositives = 0
falseNegatives = 0
falsePositives = 0
for uni in features[feature].unique():
actualFrequency = frequencyTable[(frequencyTable['feature'] == feature) & (frequencyTable['value'] == uni)].iloc[0, :]
actualConfusion = confusionMatrix[(frequencyTable['feature'] == feature) & (confusionMatrix['value'] == uni)].iloc[0, :]
acc = 0
# ak pravidlo hovori ze tato hodnota vzdy znamena ze ack = 1 tak spocitame kolko presne pripadov mame co to potvrdzuju
# predicted True
if actualConfusion.ack == 1:
acc = actualFrequency.cnt_pos / (actualFrequency.cnt_pos + actualFrequency.cnt_neg)
# pridame truePositives
truePositives += actualFrequency.cnt_pos
# a falsePositives
falsePositives += actualFrequency.cnt_neg
# Predicted False
else:
acc = actualFrequency.cnt_neg / (actualFrequency.cnt_pos + actualFrequency.cnt_neg)
# pridame falseNegatives
falseNegatives = actualFrequency.cnt_pos
cumulative_acc += acc
pocet_values += 1
# zapiseme vypocitanu acc do nasej tabulky s metrics
new_row = {
'feature': feature,
'acc': cumulative_acc / pocet_values, # accuracy za vsetky values
'precision': truePositives / (truePositives + falsePositives),
'recall': truePositives / (truePositives + falseNegatives)
}
metrics.loc[len(metrics)] = new_row
print(metrics.head(len(metrics)))
# zvolime si ten atribut s najvacsim accuracy
max_acc_index = metrics['acc'].idxmax()
return metrics.loc[max_acc_index]
best_attribute = train_OneR(x_train_OneR, y_train)
best_attribute
Trénujeme scroll_move_total_rel_distance
Trénujeme wild_mouse_duration
Trénujeme pct_click
Trénujeme pct_input
Trénujeme pct_mouse_click
Trénujeme pct_doubleclick
Trénujeme mouse_move_total_rel_distance
Trénujeme browser_name
Trénujeme sex
Trénujeme age_category
Trénujeme mail
Trénujeme race
Trénujeme is_weekend
feature acc precision recall
0 scroll_move_total_rel_distance 0.690566 0.611541 0.822171
1 wild_mouse_duration 0.698277 0.608795 0.835067
2 pct_click 0.762502 0.939237 0.977793
3 pct_input 0.625829 0.607502 0.916893
4 pct_mouse_click 0.805211 0.946295 0.976794
5 pct_doubleclick 0.735892 0.618365 0.906198
6 mouse_move_total_rel_distance 0.724695 0.612066 0.985441
7 browser_name 0.542856 0.549486 1.000000
8 sex 0.554193 0.549486 1.000000
9 age_category 0.548265 0.549486 1.000000
10 mail 0.559147 0.550557 0.994137
11 race 0.546485 0.549486 1.000000
12 is_weekend 0.554675 0.549486 1.000000
feature pct_mouse_click acc 0.805211 precision 0.946295 recall 0.976794 Name: 4, dtype: object
Najlepší atribút je pct_mouse_click s presnosťou 0,804
Implementacia OneR s rozhodnutim na základe viac atribútov¶
def train_OneR_multiple(features, labels):
frequencyTable = pd.DataFrame(columns=['feature', 'value', 'cnt_pos', 'cnt_neg'])
confusionMatrix = pd.DataFrame(columns=['feature', 'value', 'ack'])
metrics = pd.DataFrame(columns=['features', 'acc', 'precision', 'recall'])
# iterujeme cez všetky kombinácie atribútov
for feature_combo in combinations(features.columns, 2): # zmeňte 2 na počet atribútov, ktoré chcete zohľadniť
print("Trénujeme " + str(feature_combo))
cnt_pos = 0
cnt_all = 0
# vytvoríme nový stĺpec predstavujúci kombináciu atribútov
features['combined'] = features[list(feature_combo)].astype(str).agg('-'.join, axis=1)
# vytvoríme tabuľku frekvencií pre kombinované atribúty
for uni in features['combined'].unique():
examples = features[features['combined'] == uni]
cnt_posit = len(examples[labels.loc[examples.index] == 1])
cnt_neg = len(examples[labels.loc[examples.index] == 0])
new_row = {
'feature': feature_combo,
'value': uni,
'cnt_pos': cnt_posit,
'cnt_neg': cnt_neg,
}
cnt_pos += cnt_posit
cnt_all += cnt_posit + cnt_neg
frequencyTable.loc[len(frequencyTable)] = new_row
# vytvoríme maticu zmätok pre kombinované atribúty
for uni in features['combined'].unique():
actual = frequencyTable[(frequencyTable['feature'] == feature_combo) & (frequencyTable['value'] == uni)].iloc[0, :]
ack = -1
if actual.cnt_pos >= actual.cnt_neg:
ack = 1
else:
ack = 0
new_row = {
'feature': feature_combo,
'value': uni,
'ack': ack
}
confusionMatrix.loc[len(confusionMatrix)] = new_row
cumulative_acc = 0
pocet_values = 0
truePositives = 0
falseNegatives = 0
falsePositives = 0
# vypočítame presnosť pre kombinované atribúty
for uni in features['combined'].unique():
actualFrequency = frequencyTable[(frequencyTable['feature'] == feature_combo) & (frequencyTable['value'] == uni)].iloc[0, :]
actualConfusion = confusionMatrix[(frequencyTable['feature'] == feature_combo) & (confusionMatrix['value'] == uni)].iloc[0, :]
acc = 0
# Predicted True
if actualConfusion.ack == 1:
acc = actualFrequency.cnt_pos / (actualFrequency.cnt_pos + actualFrequency.cnt_neg)
# pridame truePositives
truePositives += actualFrequency.cnt_pos
# a falsePositives
falsePositives += actualFrequency.cnt_neg
# Predicted False
else:
acc = actualFrequency.cnt_neg / (actualFrequency.cnt_pos + actualFrequency.cnt_neg)
# pridame falseNegatives
falseNegatives = actualFrequency.cnt_pos
cumulative_acc += acc
pocet_values += 1
new_row = {
'features': feature_combo,
'acc': cumulative_acc / pocet_values,
'precision': truePositives / (truePositives + falsePositives),
'recall': truePositives / (truePositives + falseNegatives)
}
metrics.loc[len(metrics)] = new_row
print(metrics.head(len(metrics)))
max_acc_index = metrics['acc'].idxmax()
return metrics.loc[max_acc_index]
best_attribute_ = train_OneR_multiple(x_train_OneR, y_train)
best_attribute_
Trénujeme ('scroll_move_total_rel_distance', 'wild_mouse_duration')
Trénujeme ('scroll_move_total_rel_distance', 'pct_click')
Trénujeme ('scroll_move_total_rel_distance', 'pct_input')
Trénujeme ('scroll_move_total_rel_distance', 'pct_mouse_click')
Trénujeme ('scroll_move_total_rel_distance', 'pct_doubleclick')
Trénujeme ('scroll_move_total_rel_distance', 'mouse_move_total_rel_distance')
Trénujeme ('scroll_move_total_rel_distance', 'browser_name')
Trénujeme ('scroll_move_total_rel_distance', 'sex')
Trénujeme ('scroll_move_total_rel_distance', 'age_category')
Trénujeme ('scroll_move_total_rel_distance', 'mail')
Trénujeme ('scroll_move_total_rel_distance', 'race')
Trénujeme ('scroll_move_total_rel_distance', 'is_weekend')
Trénujeme ('wild_mouse_duration', 'pct_click')
Trénujeme ('wild_mouse_duration', 'pct_input')
Trénujeme ('wild_mouse_duration', 'pct_mouse_click')
Trénujeme ('wild_mouse_duration', 'pct_doubleclick')
Trénujeme ('wild_mouse_duration', 'mouse_move_total_rel_distance')
Trénujeme ('wild_mouse_duration', 'browser_name')
Trénujeme ('wild_mouse_duration', 'sex')
Trénujeme ('wild_mouse_duration', 'age_category')
Trénujeme ('wild_mouse_duration', 'mail')
Trénujeme ('wild_mouse_duration', 'race')
Trénujeme ('wild_mouse_duration', 'is_weekend')
Trénujeme ('pct_click', 'pct_input')
Trénujeme ('pct_click', 'pct_mouse_click')
Trénujeme ('pct_click', 'pct_doubleclick')
Trénujeme ('pct_click', 'mouse_move_total_rel_distance')
Trénujeme ('pct_click', 'browser_name')
Trénujeme ('pct_click', 'sex')
Trénujeme ('pct_click', 'age_category')
Trénujeme ('pct_click', 'mail')
Trénujeme ('pct_click', 'race')
Trénujeme ('pct_click', 'is_weekend')
Trénujeme ('pct_input', 'pct_mouse_click')
Trénujeme ('pct_input', 'pct_doubleclick')
Trénujeme ('pct_input', 'mouse_move_total_rel_distance')
Trénujeme ('pct_input', 'browser_name')
Trénujeme ('pct_input', 'sex')
Trénujeme ('pct_input', 'age_category')
Trénujeme ('pct_input', 'mail')
Trénujeme ('pct_input', 'race')
Trénujeme ('pct_input', 'is_weekend')
Trénujeme ('pct_mouse_click', 'pct_doubleclick')
Trénujeme ('pct_mouse_click', 'mouse_move_total_rel_distance')
Trénujeme ('pct_mouse_click', 'browser_name')
Trénujeme ('pct_mouse_click', 'sex')
Trénujeme ('pct_mouse_click', 'age_category')
Trénujeme ('pct_mouse_click', 'mail')
Trénujeme ('pct_mouse_click', 'race')
Trénujeme ('pct_mouse_click', 'is_weekend')
Trénujeme ('pct_doubleclick', 'mouse_move_total_rel_distance')
Trénujeme ('pct_doubleclick', 'browser_name')
Trénujeme ('pct_doubleclick', 'sex')
Trénujeme ('pct_doubleclick', 'age_category')
Trénujeme ('pct_doubleclick', 'mail')
Trénujeme ('pct_doubleclick', 'race')
Trénujeme ('pct_doubleclick', 'is_weekend')
Trénujeme ('mouse_move_total_rel_distance', 'browser_name')
Trénujeme ('mouse_move_total_rel_distance', 'sex')
Trénujeme ('mouse_move_total_rel_distance', 'age_category')
Trénujeme ('mouse_move_total_rel_distance', 'mail')
Trénujeme ('mouse_move_total_rel_distance', 'race')
Trénujeme ('mouse_move_total_rel_distance', 'is_weekend')
Trénujeme ('browser_name', 'sex')
Trénujeme ('browser_name', 'age_category')
Trénujeme ('browser_name', 'mail')
Trénujeme ('browser_name', 'race')
Trénujeme ('browser_name', 'is_weekend')
Trénujeme ('sex', 'age_category')
Trénujeme ('sex', 'mail')
Trénujeme ('sex', 'race')
Trénujeme ('sex', 'is_weekend')
Trénujeme ('age_category', 'mail')
Trénujeme ('age_category', 'race')
Trénujeme ('age_category', 'is_weekend')
Trénujeme ('mail', 'race')
Trénujeme ('mail', 'is_weekend')
Trénujeme ('race', 'is_weekend')
features acc precision \
0 (scroll_move_total_rel_distance, wild_mouse_du... 0.830729 0.716963
1 (scroll_move_total_rel_distance, pct_click) 0.781820 0.919981
2 (scroll_move_total_rel_distance, pct_input) 0.822459 0.709790
3 (scroll_move_total_rel_distance, pct_mouse_click) 0.871456 0.939879
4 (scroll_move_total_rel_distance, pct_doubleclick) 0.824430 0.710916
.. ... ... ...
73 (age_category, race) 0.562863 0.554885
74 (age_category, is_weekend) 0.553394 0.549486
75 (mail, race) 0.616972 0.558287
76 (mail, is_weekend) 0.573574 0.551947
77 (race, is_weekend) 0.549295 0.549486
recall
0 0.998173
1 0.997921
2 0.999453
3 0.999037
4 0.994700
.. ...
73 0.994804
74 1.000000
75 0.999781
76 0.998515
77 1.000000
[78 rows x 4 columns]
features (scroll_move_total_rel_distance, pct_mouse_click) acc 0.871456 precision 0.939879 recall 0.999037 Name: 3, dtype: object
Najlepšíe atribúty su pct_mouse_click a wild_mouse_duration s presnosťou 0.829¶
Vyhodnotenie klasifikátorov¶
Model, ktorý zohľadňoval viacero atribútov súčasne, mal vyššiu presnosť (~0,87 pre atribúty pct_mouse_click, mouse_move_total_rel_distance) ako model, ktorý zohľadňoval každý atribút samostatne (~0,80). Okrem toho mal moddel s jednotlivými atribútmi vyššiu precision (~0,941987) a nižšiu recall (~0,977479) ako druhý model s vyslednymi atribútmi precision (~0,844995) recall (~0,992415). 579
Ale aj napriek tomu, že spomienka a presnosť boli odlišné, boli rovnako ako presnosť modelu na dostatočne vysokej úrovni, aby sa dal použiť
Trénovanie a vyhodnotenie klasifikátorov strojového učenia¶
Peter Bartoš
Trénovacie a testovacie dáta¶
train_labels = y_train.copy()
train_labels.head()
3617 0.0 4552 1.0 7333 0.0 2127 1.0 4591 1.0 Name: ack, dtype: float64
train_data = x_train.copy()
train_data.head()
| scroll_move_total_rel_distance | wild_mouse_duration | pct_click | pct_input | pct_mouse_click | pct_doubleclick | mouse_move_total_rel_distance | browser_name | sex | birthdate | race | is_weekend | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3617 | 15.37356 | 12.64546 | 11.70060 | 0.12209 | 13.18518 | 8.90191 | 13.12975 | 0 | 0.0 | 93 | 0 | 5 | 0 |
| 4552 | 10.17630 | 14.56998 | 16.07030 | 0.83816 | 8.78840 | 7.69944 | 12.56797 | 1 | 1.0 | 30 | 1 | 5 | 0 |
| 7333 | 14.93004 | 12.39987 | 11.33534 | 0.09540 | 14.69664 | 9.21089 | 13.01998 | 3 | 0.0 | 29 | 2 | 5 | 1 |
| 2127 | 9.93342 | 13.23541 | 12.76841 | 0.22075 | 9.76927 | 10.87443 | 13.57756 | 1 | 0.0 | -1 | 1 | 5 | 0 |
| 4591 | 7.60152 | 15.75110 | 14.07854 | 2.73049 | 11.54801 | 10.62808 | 14.54479 | 1 | 0.0 | 115 | 2 | 5 | 1 |
test_labels = y_test.copy()
test_labels.head()
698 0.0 8067 0.0 10519 1.0 2058 0.0 174 0.0 Name: ack, dtype: float64
test_data = x_test.copy()
test_data.head()
| scroll_move_total_rel_distance | wild_mouse_duration | pct_click | pct_input | pct_mouse_click | pct_doubleclick | mouse_move_total_rel_distance | browser_name | sex | birthdate | race | is_weekend | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 698 | 13.86962 | 15.43291 | 8.51178 | 1.98612 | 11.89460 | 11.69669 | 11.73783 | 5 | 0.0 | 32 | 0 | 5 | 0 |
| 8067 | 19.38146 | 7.74124 | 9.52298 | 0.00072 | 12.08462 | 11.78358 | 6.81960 | 1 | 1.0 | 65 | 1 | 5 | 1 |
| 10519 | 9.24959 | 14.79667 | 14.20263 | 1.05148 | 9.94741 | 10.04100 | 13.34917 | 1 | 0.0 | 17 | 2 | 1 | 0 |
| 2058 | 11.08665 | 14.01664 | 11.08920 | 0.48218 | 16.43056 | 16.27889 | 8.91683 | 0 | 0.0 | -1 | 3 | 5 | 1 |
| 174 | 15.15629 | 12.71154 | 9.81530 | 0.13034 | 12.61486 | 12.60312 | 9.78779 | 5 | 0.0 | -1 | 4 | 3 | 1 |
Decision Tree Classifier¶
Využijeme DecisionTreeClassifier pre trénovanie a vyhodnotenie klasifikátora strojového učenia. Tento algoritmus by potenciálne mal vyprodukovať lepšie výsledky ako náš OneR algoritmus.
dtc = DecisionTreeClassifier()
predicted_t = dtc.fit(train_data, train_labels).predict(test_data)
print("Accuracy:", accuracy_score(test_labels, predicted_t))
Accuracy: 0.9508599508599509
print(classification_report(test_labels, predicted_t, target_names=["0", "1"]))
precision recall f1-score support
0 0.95 0.94 0.95 932
1 0.95 0.96 0.95 1103
accuracy 0.95 2035
macro avg 0.95 0.95 0.95 2035
weighted avg 0.95 0.95 0.95 2035
mat = confusion_matrix(test_labels, predicted_t)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
# !!! Zakomentované, ak treba vyzualizovať strom, tak stačí iba odkomentovať !!!
# from sklearn.tree import export_graphviz
# from graphviz import Source
# from IPython.display import SVG
# graph = Source(export_graphviz(tree,
# out_file=None,
# feature_names=test_data.columns,
# class_names=["no", "yes"],
# filled = True))
# display(SVG(graph.pipe(format='svg')))
# from IPython.display import HTML # toto je tu len pre to aby sa mi obrazok zmestil na obrazovku
# style = "<style>svg{width:100% !important;height:70% !important;}</style>"
# HTML(style)
# tree.plot_tree(dtc)
Random Forest Classifier¶
Využijeme RandomForestClassifier pre trénovanie a vyhodnotenie klasifikátora strojového učenia. Tento algoritmus by potenciálne mal vyprodukovať lepšie výsledky ako náš OneR algoritmus.
forest = RandomForestClassifier()
predicted_f = forest.fit(train_data, train_labels).predict(test_data)
print("Accuracy:", accuracy_score(test_labels, predicted_f))
Accuracy: 0.9690417690417691
print(classification_report(test_labels, predicted_f, target_names=["0", "1"]))
precision recall f1-score support
0 0.98 0.95 0.97 932
1 0.96 0.98 0.97 1103
accuracy 0.97 2035
macro avg 0.97 0.97 0.97 2035
weighted avg 0.97 0.97 0.97 2035
mat = confusion_matrix(test_labels, predicted_f)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
Porovnanie metrík algoritmov¶
print("Metriky prvého OneR algoritmu:\n", best_attribute)
Metriky prvého OneR algoritmu: feature pct_mouse_click acc 0.805211 precision 0.946295 recall 0.976794 Name: 4, dtype: object
print("Metriky druhého OneR algoritmu:\n", best_attribute_)
Metriky druhého OneR algoritmu: features (scroll_move_total_rel_distance, pct_mouse_click) acc 0.871456 precision 0.939879 recall 0.999037 Name: 3, dtype: object
print("Metriky DecisionTreeClassifier algoritmu:\n", classification_report(test_labels, predicted_t, target_names=["0","1"]))
Metriky DecisionTreeClassifier algoritmu:
precision recall f1-score support
0 0.95 0.94 0.95 932
1 0.95 0.96 0.95 1103
accuracy 0.95 2035
macro avg 0.95 0.95 0.95 2035
weighted avg 0.95 0.95 0.95 2035
print("Metriky RandomForestClassifier algoritmu:\n", classification_report(test_labels, predicted_f, target_names=["0","1"]))
Metriky RandomForestClassifier algoritmu:
precision recall f1-score support
0 0.98 0.95 0.97 932
1 0.96 0.98 0.97 1103
accuracy 0.97 2035
macro avg 0.97 0.97 0.97 2035
weighted avg 0.97 0.97 0.97 2035
Z vypísaných metrík týchto algoritmov vyzerá najlepšie RandomForestClassifier potom DecisionTreeClassifier a potom naše OneR algoritmy, čo bolo očakávané.
Optimalizácia alias hyperparameter tuning¶
Hlib Kokin
Preskúmanie Random Forest a hyperparametrov¶
RandomForest je ako les stromov, kde každý strom je ako malá časť celej lesnej rodiny. Každý strom v tomto lese je rozhodovací strom, ktorý sa učí, ako rozhodovať o výsledku na základe rôznych vlastností alebo vstupných údajov.
Teraz, kým každý strom môže byť dosť dobrý sám o sebe, získame silu tým, že ich kombinujeme. Ako keď sa v lese konzultujú rôzne stromy a rozhodujú o tom istom probléme. To znižuje pravdepodobnosť, že sa zmýlia, a dáva nám robustnejší a presnejší výsledek.
Teraz, čo sa týka hyperparametrov, to sú tie veci, ktoré môžeme nastaviť na lepšiu výkonnosť našich stromov. Niektoré z najdôležitejších hyperparametrov RandomForest od sklearn sú:
n_estimators: To je jednoducho počet stromov v "lesnom tíme". Čím viac stromov, tým lepšia generalizácia a stabilnejší výsledok, ale na druhej strane to môže trvať dlhšie, najmä ak je dataset veľký.
max_depth: Toto nastavenie určuje, ako hlboko môže ísť každý strom. Ak je príliš vysoké, môže naraziť na pretrénovanie, kedy sa model príliš prispôsobí trénovacím dátam. Naopak, ak je príliš nízke, stromy môžu byť príliš jednoduché na zachytenie komplexných vzorov v dátach.
min_samples_split: Je to minimálny počet príkladov, ktorý potrebuješ na to, aby sa uzol rozdelil na nové vetvy. Ak táto hodnota nie je dosiahnutá, vetva sa nevytvorí. To je užitočné na kontrolu toho, ako rýchlo sa model môže prispôsobiť menším vzorom alebo outlierom.
min_samples_leaf: Tento hyperparameter určuje minimálny počet príkladov v listových uzloch. Ak je táto hodnota príliš nízka, môže skončiť s veľmi malými a špecifickými vetvami, čo môže viesť k pretrénovaniu. Naopak, vyššia hodnota môže zabezpečiť, že listy sú robustnejšie a generalizujú sa lepšie na nové dáta.
max_features: Toto určuje, koľko funkcií (príznakov) sa má zvážiť pri hľadaní najlepšieho rozdelenia uzla. Vyššia hodnota môže viesť k väčšej variabilite, ale tiež k náročnejšiemu trénovaniu modelu.alým vetvám.
Dôvod výberu RandomForest:
Spracovanie rôznych typov príznakov: RandomForest sa výborne vysporiada s kombináciou numerických a kategorických príznakov. Má schopnosť automaticky spracovať rôzne typy dát, čo uľahčuje prácu s viacnásobnými formátmi v datasete.
Odolnosť voči pretrénovaniu: Vďaka tomu, že je RandomForest zložený z viacerých stromov, ktoré hlasujú o výsledku, je menej náchylný na pretrénovanie (overfitting). To je užitočné, keď pracuješ s rôznorodými dátami a chceš, aby model dobre generalizoval na nové príklady.
Robustné k dátovým šumom: Ak dataset obsahuje nejaký ten šum alebo nepresnosti, RandomForest je celkom odolný voči nim. Jeho schopnosť hlasovania z rôznych stromov umožňuje ignorovať niektoré chybné alebo odľahlé hodnoty.
Jednoduché ladnie hyperparametrov: Nastavovanie hyperparametrov v RandomForest nie je také ťažké. Máš pár kľúčových parametrov, ktoré je môžne upraviť podľa potreby.
Schopnosť vyhodnotiť dôležitosť príznakov: RandomForest poskytne informácie o tom, ktoré príznaky boli najdôležitejšie pri rozhodovaní. To môže byť užitočné pre interpretáciu výsledkov a pochopenie, ktoré faktory majú najväčší vplyv na rozhodnutia.
Hyperaparameters tuning¶
Parametre GridSearch budú mať tiež predvolené hodnoty
GridSearch pre všetky parametre¶
param_grid = {
'n_estimators': [100, 200, 300],
'min_samples_split': [1, 2, 3],
'min_samples_leaf': [1, 3, 5],
'max_depth': [None, 3, 5, 10],
}
rf = RandomForestClassifier()
# use 3-fold cross validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, return_train_score=True, cv=3, scoring='accuracy')
grid_search.fit(train_data, train_labels)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_model = grid_search.best_estimator_
results = grid_search.cv_results_
results_df = pd.DataFrame(results)
desired_columns = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
metric_columns = [col for col in results_df.columns if 'mean_test' in col or 'std_test' in col or 'rank_test' in col]
print(results_df[desired_columns + metric_columns])
print("Najlepšie parametre:", best_params)
print("Najlepší skóre na trenovacich dátach:", best_score)
C:\Users\peter\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py:425: FitFailedWarning:
108 fits failed out of a total of 324.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
108 fits failed with the following error:
Traceback (most recent call last):
File "C:\Users\peter\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\peter\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1144, in wrapper
estimator._validate_params()
File "C:\Users\peter\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 637, in _validate_params
validate_parameter_constraints(
File "C:\Users\peter\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'min_samples_split' parameter of RandomForestClassifier must be an int in the range [2, inf) or a float in the range (0.0, 1.0]. Got 1 instead.
warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\peter\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_search.py:976: UserWarning: One or more of the test scores are non-finite: [ nan nan nan 0.96217077 0.96137079 0.96182792
0.96194215 0.96194223 0.96159925 nan nan nan
0.95737089 0.95702796 0.9575994 0.95748513 0.95702803 0.95759944
nan nan nan 0.95634216 0.95679945 0.95599942
0.95622797 0.95599942 0.95531371 nan nan nan
0.93645649 0.93348556 0.93188562 0.93245718 0.93085685 0.93268541
nan nan nan 0.93314246 0.9318855 0.93199982
0.93565642 0.93119991 0.9309712 nan nan nan
0.92742793 0.93199958 0.93222824 0.93268475 0.93280019 0.9298284
nan nan nan 0.94925675 0.94891393 0.94994246
0.94868535 0.94994239 0.94925671 nan nan nan
0.94937106 0.94891385 0.94891381 0.94777109 0.94925667 0.94982815
nan nan nan 0.94994239 0.94834241 0.94902817
0.95005666 0.94857111 0.94811387 nan nan nan
0.95874216 0.95862785 0.95885644 0.95874224 0.95794214 0.95851358
nan nan nan 0.95668526 0.95588507 0.95634224
0.95634216 0.95679933 0.95588496 nan nan nan
0.95508501 0.95508509 0.954628 0.95554229 0.95645648 0.95554222]
warnings.warn(
C:\Users\peter\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_search.py:976: UserWarning: One or more of the train scores are non-finite: [ nan nan nan 1. 1. 1.
0.99954284 0.99977144 1. nan nan nan
0.96994279 0.96965705 0.96977133 0.96977133 0.96954275 0.96942846
nan nan nan 0.96302842 0.96285701 0.96314272
0.9631999 0.96308557 0.96297127 nan nan nan
0.93799996 0.93822826 0.93645681 0.93382815 0.93445686 0.93582831
nan nan nan 0.93702835 0.93434276 0.93485692
0.93862845 0.93662824 0.93565687 nan nan nan
0.93297131 0.93594271 0.93519971 0.93731423 0.93542824 0.93342836
nan nan nan 0.95497116 0.95388558 0.95394264
0.95411402 0.95525695 0.95434268 nan nan nan
0.95325688 0.95388558 0.95405694 0.9525712 0.95411413 0.95479985
nan nan nan 0.95337133 0.95405695 0.95382838
0.9543427 0.95377122 0.95319981 nan nan nan
0.96959992 0.96954277 0.96937134 0.96902847 0.96908557 0.96931418
nan nan nan 0.96445701 0.9646284 0.96445701
0.9646856 0.96451414 0.964457 nan nan nan
0.962057 0.96199987 0.96199985 0.96182842 0.96228558 0.96222843]
warnings.warn(
params mean_test_score \
0 {'max_depth': None, 'min_samples_leaf': 1, 'mi... NaN
1 {'max_depth': None, 'min_samples_leaf': 1, 'mi... NaN
2 {'max_depth': None, 'min_samples_leaf': 1, 'mi... NaN
3 {'max_depth': None, 'min_samples_leaf': 1, 'mi... 0.962171
4 {'max_depth': None, 'min_samples_leaf': 1, 'mi... 0.961371
.. ... ...
103 {'max_depth': 10, 'min_samples_leaf': 5, 'min_... 0.955085
104 {'max_depth': 10, 'min_samples_leaf': 5, 'min_... 0.954628
105 {'max_depth': 10, 'min_samples_leaf': 5, 'min_... 0.955542
106 {'max_depth': 10, 'min_samples_leaf': 5, 'min_... 0.956456
107 {'max_depth': 10, 'min_samples_leaf': 5, 'min_... 0.955542
std_test_score rank_test_score mean_test_score std_test_score \
0 NaN 73 NaN NaN
1 NaN 73 NaN NaN
2 NaN 73 NaN NaN
3 0.004382 1 0.962171 0.004382
4 0.004364 6 0.961371 0.004364
.. ... ... ... ...
103 0.004632 34 0.955085 0.004632
104 0.004209 36 0.954628 0.004209
105 0.004557 31 0.955542 0.004557
106 0.005068 22 0.956456 0.005068
107 0.004776 32 0.955542 0.004776
rank_test_score
0 73
1 73
2 73
3 1
4 6
.. ...
103 34
104 36
105 31
106 22
107 32
[108 rows x 7 columns]
Najlepšie parametre: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Najlepší skóre na trenovacich dátach: 0.9621707744472551
Pozrime sa na Feature_Importances:
# Print feature importances (if applicable, e.g., for tree-based models)
if hasattr(best_model, 'feature_importances_'):
feature_importances = best_model.feature_importances_
print("Feature Importances:")
for feature, importance in zip(train_data.columns, feature_importances):
print(f"{feature}: {importance:.4f}")
Feature Importances: scroll_move_total_rel_distance: 0.1243 wild_mouse_duration: 0.1089 pct_click: 0.1135 pct_input: 0.1157 pct_mouse_click: 0.3019 pct_doubleclick: 0.0626 mouse_move_total_rel_distance: 0.1372 browser_name: 0.0078 sex: 0.0021 birthdate: 0.0103 mail: 0.0090 race: 0.0041 is_weekend: 0.0025
# Spravime prediction
best_model = grid_search.best_estimator_
print
y_pred = grid_search.predict(test_data)
#spravime evaluation
accuracy = accuracy_score(test_labels, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:\n', classification_report(test_labels, y_pred, target_names=["0","1"]))
Accuracy: 0.9676
Classification Report:
precision recall f1-score support
0 0.98 0.95 0.96 932
1 0.96 0.98 0.97 1103
accuracy 0.97 2035
macro avg 0.97 0.97 0.97 2035
weighted avg 0.97 0.97 0.97 2035
Pozrime sa, či nemáme overfitting:
train_predictions = best_model.predict(train_data)
train_accuracy = accuracy_score(train_labels, train_predictions)
# Validation set performance
val_predictions = best_model.predict(test_data)
val_accuracy = accuracy_score(test_labels, val_predictions)
print(f'Training Accuracy: {train_accuracy:.4f}')
print(f'Validation Accuracy: {val_accuracy:.4f}')
Training Accuracy: 1.0000 Validation Accuracy: 0.9676
Tuning by min_samples_split¶
param_grid = {
'n_estimators': [100, 300, 500, 600, 700],
'min_samples_split': [3, 5, 7],
}
rf = RandomForestClassifier()
# use 3-fold cross validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, return_train_score=True, cv=3, scoring='accuracy')
grid_search.fit(train_data, train_labels)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_model = grid_search.best_estimator_
results = grid_search.cv_results_
results_df = pd.DataFrame(results)
desired_columns = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
metric_columns = [col for col in results_df.columns if 'mean_test' in col or 'std_test' in col or 'rank_test' in col]
print(results_df[desired_columns + metric_columns])
print("Najlepšie parametre:", best_params)
print("Najlepší skóre na trenovacich dátach:", best_score)
params mean_test_score \
0 {'min_samples_split': 3, 'n_estimators': 100} 0.962171
1 {'min_samples_split': 3, 'n_estimators': 300} 0.961028
2 {'min_samples_split': 3, 'n_estimators': 500} 0.960914
3 {'min_samples_split': 3, 'n_estimators': 600} 0.961485
4 {'min_samples_split': 3, 'n_estimators': 700} 0.961371
5 {'min_samples_split': 5, 'n_estimators': 100} 0.960456
6 {'min_samples_split': 5, 'n_estimators': 300} 0.959771
7 {'min_samples_split': 5, 'n_estimators': 500} 0.960228
8 {'min_samples_split': 5, 'n_estimators': 600} 0.959428
9 {'min_samples_split': 5, 'n_estimators': 700} 0.959657
10 {'min_samples_split': 7, 'n_estimators': 100} 0.959314
11 {'min_samples_split': 7, 'n_estimators': 300} 0.958856
12 {'min_samples_split': 7, 'n_estimators': 500} 0.958971
13 {'min_samples_split': 7, 'n_estimators': 600} 0.958628
14 {'min_samples_split': 7, 'n_estimators': 700} 0.958856
std_test_score rank_test_score mean_test_score std_test_score \
0 0.005055 1 0.962171 0.005055
1 0.005047 4 0.961028 0.005047
2 0.004959 5 0.960914 0.004959
3 0.004833 2 0.961485 0.004833
4 0.004532 3 0.961371 0.004532
5 0.005047 6 0.960456 0.005047
6 0.004497 8 0.959771 0.004497
7 0.004789 7 0.960228 0.004789
8 0.004921 10 0.959428 0.004921
9 0.004626 9 0.959657 0.004626
10 0.004556 11 0.959314 0.004556
11 0.004632 13 0.958856 0.004632
12 0.004496 12 0.958971 0.004496
13 0.004776 15 0.958628 0.004776
14 0.004632 13 0.958856 0.004632
rank_test_score
0 1
1 4
2 5
3 2
4 3
5 6
6 8
7 7
8 10
9 9
10 11
11 13
12 12
13 15
14 13
Najlepšie parametre: {'min_samples_split': 3, 'n_estimators': 100}
Najlepší skóre na trenovacich dátach: 0.9621706960709488
# Spravime prediction
best_model = grid_search.best_estimator_
print
y_pred = grid_search.predict(test_data)
#spravime evaluation
accuracy = accuracy_score(test_labels, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:\n', classification_report(test_labels, y_pred, target_names=["0","1"]))
Accuracy: 0.9676
Classification Report:
precision recall f1-score support
0 0.98 0.95 0.96 932
1 0.96 0.98 0.97 1103
accuracy 0.97 2035
macro avg 0.97 0.97 0.97 2035
weighted avg 0.97 0.97 0.97 2035
Tuning by criterion¶
param_grid = {
'n_estimators': [100, 200, 300],
'max_features': ['sqrt', 'log2'],
'max_depth' : [4,5,6,7,8],
'criterion' :['gini', 'entropy']
}
rf = RandomForestClassifier()
# use 3-fold cross validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, return_train_score=True, cv=3, scoring='accuracy')
grid_search.fit(train_data, train_labels)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_model = grid_search.best_estimator_
results = grid_search.cv_results_
results_df = pd.DataFrame(results)
desired_columns = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
metric_columns = [col for col in results_df.columns if 'mean_test' in col or 'std_test' in col or 'rank_test' in col]
print(results_df[desired_columns + metric_columns])
print("Najlepšie parametre:", best_params)
params mean_test_score \
0 {'criterion': 'gini', 'max_depth': 4, 'max_fea... 0.942971
1 {'criterion': 'gini', 'max_depth': 4, 'max_fea... 0.943657
2 {'criterion': 'gini', 'max_depth': 4, 'max_fea... 0.942857
3 {'criterion': 'gini', 'max_depth': 4, 'max_fea... 0.941371
4 {'criterion': 'gini', 'max_depth': 4, 'max_fea... 0.946171
5 {'criterion': 'gini', 'max_depth': 4, 'max_fea... 0.943999
6 {'criterion': 'gini', 'max_depth': 5, 'max_fea... 0.948114
7 {'criterion': 'gini', 'max_depth': 5, 'max_fea... 0.949714
8 {'criterion': 'gini', 'max_depth': 5, 'max_fea... 0.950514
9 {'criterion': 'gini', 'max_depth': 5, 'max_fea... 0.948000
10 {'criterion': 'gini', 'max_depth': 5, 'max_fea... 0.949257
11 {'criterion': 'gini', 'max_depth': 5, 'max_fea... 0.949828
12 {'criterion': 'gini', 'max_depth': 6, 'max_fea... 0.953371
13 {'criterion': 'gini', 'max_depth': 6, 'max_fea... 0.951542
14 {'criterion': 'gini', 'max_depth': 6, 'max_fea... 0.952799
15 {'criterion': 'gini', 'max_depth': 6, 'max_fea... 0.952913
16 {'criterion': 'gini', 'max_depth': 6, 'max_fea... 0.953371
17 {'criterion': 'gini', 'max_depth': 6, 'max_fea... 0.953142
18 {'criterion': 'gini', 'max_depth': 7, 'max_fea... 0.956114
19 {'criterion': 'gini', 'max_depth': 7, 'max_fea... 0.954971
20 {'criterion': 'gini', 'max_depth': 7, 'max_fea... 0.955314
21 {'criterion': 'gini', 'max_depth': 7, 'max_fea... 0.954742
22 {'criterion': 'gini', 'max_depth': 7, 'max_fea... 0.955314
23 {'criterion': 'gini', 'max_depth': 7, 'max_fea... 0.954971
24 {'criterion': 'gini', 'max_depth': 8, 'max_fea... 0.956114
25 {'criterion': 'gini', 'max_depth': 8, 'max_fea... 0.956228
26 {'criterion': 'gini', 'max_depth': 8, 'max_fea... 0.956914
27 {'criterion': 'gini', 'max_depth': 8, 'max_fea... 0.954971
28 {'criterion': 'gini', 'max_depth': 8, 'max_fea... 0.955999
29 {'criterion': 'gini', 'max_depth': 8, 'max_fea... 0.956457
30 {'criterion': 'entropy', 'max_depth': 4, 'max_... 0.946057
31 {'criterion': 'entropy', 'max_depth': 4, 'max_... 0.945142
32 {'criterion': 'entropy', 'max_depth': 4, 'max_... 0.945028
33 {'criterion': 'entropy', 'max_depth': 4, 'max_... 0.940799
34 {'criterion': 'entropy', 'max_depth': 4, 'max_... 0.944914
35 {'criterion': 'entropy', 'max_depth': 4, 'max_... 0.945485
36 {'criterion': 'entropy', 'max_depth': 5, 'max_... 0.947656
37 {'criterion': 'entropy', 'max_depth': 5, 'max_... 0.948342
38 {'criterion': 'entropy', 'max_depth': 5, 'max_... 0.949371
39 {'criterion': 'entropy', 'max_depth': 5, 'max_... 0.948571
40 {'criterion': 'entropy', 'max_depth': 5, 'max_... 0.948228
41 {'criterion': 'entropy', 'max_depth': 5, 'max_... 0.949599
42 {'criterion': 'entropy', 'max_depth': 6, 'max_... 0.951885
43 {'criterion': 'entropy', 'max_depth': 6, 'max_... 0.952799
44 {'criterion': 'entropy', 'max_depth': 6, 'max_... 0.952228
45 {'criterion': 'entropy', 'max_depth': 6, 'max_... 0.952456
46 {'criterion': 'entropy', 'max_depth': 6, 'max_... 0.951085
47 {'criterion': 'entropy', 'max_depth': 6, 'max_... 0.952685
48 {'criterion': 'entropy', 'max_depth': 7, 'max_... 0.954056
49 {'criterion': 'entropy', 'max_depth': 7, 'max_... 0.955314
50 {'criterion': 'entropy', 'max_depth': 7, 'max_... 0.955199
51 {'criterion': 'entropy', 'max_depth': 7, 'max_... 0.954399
52 {'criterion': 'entropy', 'max_depth': 7, 'max_... 0.954856
53 {'criterion': 'entropy', 'max_depth': 7, 'max_... 0.954742
54 {'criterion': 'entropy', 'max_depth': 8, 'max_... 0.955999
55 {'criterion': 'entropy', 'max_depth': 8, 'max_... 0.956571
56 {'criterion': 'entropy', 'max_depth': 8, 'max_... 0.956228
57 {'criterion': 'entropy', 'max_depth': 8, 'max_... 0.955199
58 {'criterion': 'entropy', 'max_depth': 8, 'max_... 0.956571
59 {'criterion': 'entropy', 'max_depth': 8, 'max_... 0.956114
std_test_score rank_test_score mean_test_score std_test_score \
0 0.005032 57 0.942971 0.005032
1 0.003105 56 0.943657 0.003105
2 0.003651 58 0.942857 0.003651
3 0.004355 59 0.941371 0.004355
4 0.004791 49 0.946171 0.004791
5 0.004107 55 0.943999 0.004107
6 0.002532 46 0.948114 0.002532
7 0.003930 39 0.949714 0.003930
8 0.003888 37 0.950514 0.003888
9 0.003155 47 0.948000 0.003155
10 0.002677 42 0.949257 0.002677
11 0.003979 38 0.949828 0.003979
12 0.005236 26 0.953371 0.005236
13 0.004498 35 0.951542 0.004498
14 0.004627 30 0.952799 0.004627
15 0.006026 28 0.952913 0.006026
16 0.004486 25 0.953371 0.004486
17 0.005049 27 0.953142 0.005049
18 0.004682 7 0.956114 0.004682
19 0.004922 17 0.954971 0.004922
20 0.005480 13 0.955314 0.005480
21 0.003926 21 0.954742 0.003926
22 0.005328 13 0.955314 0.005328
23 0.004626 17 0.954971 0.004626
24 0.005069 9 0.956114 0.005069
25 0.005055 5 0.956228 0.005055
26 0.004768 1 0.956914 0.004768
27 0.004777 19 0.954971 0.004777
28 0.004808 10 0.955999 0.004808
29 0.004790 4 0.956457 0.004790
30 0.003930 50 0.946057 0.003930
31 0.003515 52 0.945142 0.003515
32 0.004068 53 0.945028 0.004068
33 0.004812 60 0.940799 0.004812
34 0.004834 54 0.944914 0.004834
35 0.004353 51 0.945485 0.004353
36 0.006610 48 0.947656 0.006610
37 0.004067 44 0.948342 0.004067
38 0.003939 41 0.949371 0.003939
39 0.003956 43 0.948571 0.003956
40 0.004781 45 0.948228 0.004781
41 0.004513 40 0.949599 0.004513
42 0.003787 34 0.951885 0.003787
43 0.004218 29 0.952799 0.004218
44 0.005635 33 0.952228 0.005635
45 0.005608 32 0.952456 0.005608
46 0.004777 36 0.951085 0.004777
47 0.004234 31 0.952685 0.004234
48 0.004632 24 0.954056 0.004632
49 0.005364 12 0.955314 0.005364
50 0.004923 16 0.955199 0.004923
51 0.004960 23 0.954399 0.004960
52 0.005328 20 0.954856 0.005328
53 0.005347 22 0.954742 0.005347
54 0.004768 11 0.955999 0.004768
55 0.004626 2 0.956571 0.004626
56 0.004906 5 0.956228 0.004906
57 0.004768 15 0.955199 0.004768
58 0.006173 3 0.956571 0.006173
59 0.004958 8 0.956114 0.004958
rank_test_score
0 57
1 56
2 58
3 59
4 49
5 55
6 46
7 39
8 37
9 47
10 42
11 38
12 26
13 35
14 30
15 28
16 25
17 27
18 7
19 17
20 13
21 21
22 13
23 17
24 9
25 5
26 1
27 19
28 10
29 4
30 50
31 52
32 53
33 60
34 54
35 51
36 48
37 44
38 41
39 43
40 45
41 40
42 34
43 29
44 33
45 32
46 36
47 31
48 24
49 12
50 16
51 23
52 20
53 22
54 11
55 2
56 5
57 15
58 3
59 8
Najlepšie parametre: {'criterion': 'gini', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 300}
print("Najlepšie skóre na trénovacich dátach:", best_score)
Najlepšie skóre na trénovacich dátach: 0.9569136053273316
Výsledky tohto GridSearch sa ukázali byť horšie ako iné, štandardné hodnoty tohto modelu fungujú lepšie.
Keďže najlepší výsledok poskytol model s veľkým počtom n_estimátorov (ostatne parametre - default), vyskúšajme testy s nastavením tohto parametra.
# Spravime prediction
best_model = grid_search.best_estimator_
print
y_pred = grid_search.predict(test_data)
#spravime evaluation
accuracy = accuracy_score(test_labels, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:\n', classification_report(test_labels, y_pred, target_names=["0","1"]))
Accuracy: 0.9597
Classification Report:
precision recall f1-score support
0 0.97 0.94 0.96 932
1 0.95 0.98 0.96 1103
accuracy 0.96 2035
macro avg 0.96 0.96 0.96 2035
weighted avg 0.96 0.96 0.96 2035
# Print feature importances (if applicable, e.g., for tree-based models)
if hasattr(best_model, 'feature_importances_'):
feature_importances = best_model.feature_importances_
print("Feature Importances:")
for feature, importance in zip(train_data.columns, feature_importances):
print(f"{feature}: {importance:.4f}")
Feature Importances: scroll_move_total_rel_distance: 0.1002 wild_mouse_duration: 0.1016 pct_click: 0.1101 pct_input: 0.1163 pct_mouse_click: 0.3714 pct_doubleclick: 0.0530 mouse_move_total_rel_distance: 0.1359 browser_name: 0.0029 sex: 0.0006 birthdate: 0.0030 mail: 0.0029 race: 0.0013 is_weekend: 0.0007
Podľa výsledkov trénovania a testovania vidíme, že všetky predvolené parametre okrem n_estimátorov sú vhodné pre aktuálny súbor údajov, najlepšia hodnota n_estimátorov bola 500, ak zadáte vyššiu hodnotu - model bude overfitnuty
Zaver Hyperparameters tuning¶
Zdá sa nám, že by bolo lepšie použiť model s n_estimators = 500 a min_samples_split = 3, pretože je presnejší,(napríklad pri vyšších hodnotách n_estimators vo výsledku máme overfitting, lebo tej trénovacej moci je príliš veľa a pri vyšších hodnotách min_samples_split model bude priliš presný, čo bude vplyvať na efektívnosť) hoci jeho trénovanie bude trvať trochu dlhšie, ale nespraví to veľký rozdiel, a tiež má zmysel použiť model s vyššou hodnotou min_samples_split (3), aby model prispôsobil menším vzorom alebo outlierom.
Vyhodnotenie vplyvu zvolenej stratégie riešenia na klasifikáciu¶
Peter Bartoš
Načítame si dáta, kde nie sú vymazané outliers a ešte obsahujú chýbajúce hodnoty (nepredspracované dáta):
df_test = drop_low_cor_cols(pd.merge(sessions, users, on=["user_id", "user_id"]))
df_test.head()
| scroll_move_total_rel_distance | wild_mouse_duration | pct_click | pct_input | pct_mouse_click | ack | pct_doubleclick | mouse_move_total_rel_distance | browser_name | session_start | sex | birthdate | race | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11.76366 | 13.33927 | 14.64515 | 0.24442 | 8.40200 | 1.0 | 8.03629 | 12.18790 | safari | 2019-11-25 19:58:57 | F | NaN | pedro-henriquefarias@bol.com.br | black |
| 1 | 11.06654 | 13.35117 | 13.32630 | 0.24731 | 9.16945 | 1.0 | 9.62225 | 12.85323 | chrome | 2022-06-30 13:36:36 | F | NaN | pedro-henriquefarias@bol.com.br | black |
| 2 | 14.40034 | 16.76548 | 11.92945 | 7.53010 | 12.59008 | 0.0 | 6.60933 | 14.34023 | edge | 2020-08-05 18:12:38 | F | NaN | pedro-henriquefarias@bol.com.br | black |
| 3 | 9.48744 | 9.73660 | 9.26560 | 0.00660 | 9.43816 | 1.0 | 14.85800 | 11.86188 | chrome | 2022-12-20 10:29:17 | F | NaN | pedro-henriquefarias@bol.com.br | black |
| 4 | 10.57724 | 13.65790 | 11.30114 | 0.33680 | 14.44700 | 0.0 | 15.71613 | 7.77553 | safari | 2020-04-30 07:06:58 | F | NaN | pedro-henriquefarias@bol.com.br | black |
Zavedieme si jeden df, ktorý bude mať vyplnené chýbajúceho hodnoty pomocou mean, druhý bude mať vyplnené pomocou KNN a tretí df tiež s KNN:
df1 = fillout_all_with_mean(df_test)
df2 = fillout_all_with_knn(df_test)
df3 = df2.copy()
Vyhodíme duplicitné hodnoty v df1 a nahradíme vychýlené hodnoty v df1. To isté spravíme aj s df2. V df3 nespravíme nič:
df1 = df1.drop_duplicates()
df2 = df2.drop_duplicates()
for col_name in cols:
df1 = identify_and_replace_outliers(df1, col_name)
df2 = identify_and_replace_outliers(df2, col_name)
print("df1:", len(df1), "df2:", len(df2), "df3:", len(df3))
df1: 10785 df2: 10785 df3: 11048
Vykonáme dátovú transformáciu, keďže chceme reprezentovať stringy nejakou číslovou hodnotou:
df1 = preprocess_columns(df1)
df2 = preprocess_columns(df2)
df3 = preprocess_columns(df3)
{'chrome': 0, 'edge': 1, 'firefox': 2, 'mobile': 3, 'opera': 4, 'other': 5, 'safari': 6}
['bol' 'fastwebnet' 'ig' 'yahoo' 'googlemail' 'hotmail' 'tiscali' 'live'
'gmail' 'outlook' 'libero' 'email' 'virgilio' 'aol' 'tin' 'gmx' 'centrum'
'volny' 'web' 'seznam' 'vodafone' 'post' 'alice' 'tim' 'uol' 'chello'
'tele2' 'poste']
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25 26 27]
{'asian': 0, 'black': 1, 'indian': 2, 'unspecified': 3, 'white': 4, nan: 5}
{'chrome': 0, 'edge': 1, 'firefox': 2, 'mobile': 3, 'opera': 4, 'other': 5, 'safari': 6}
['bol' 'fastwebnet' 'ig' 'yahoo' 'googlemail' 'hotmail' 'tiscali' 'live'
'gmail' 'outlook' 'libero' 'email' 'virgilio' 'aol' 'tin' 'gmx' 'centrum'
'volny' 'web' 'seznam' 'vodafone' 'post' 'alice' 'tim' 'uol' 'chello'
'tele2' 'poste']
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25 26 27]
{'asian': 0, 'black': 1, 'indian': 2, 'unspecified': 3, 'white': 4, nan: 5}
{'chrome': 0, 'edge': 1, 'firefox': 2, 'mobile': 3, 'opera': 4, 'other': 5, 'safari': 6}
['bol' 'fastwebnet' 'ig' 'yahoo' 'googlemail' 'hotmail' 'tiscali' 'live'
'gmail' 'outlook' 'libero' 'email' 'virgilio' 'aol' 'tin' 'gmx' 'centrum'
'volny' 'web' 'seznam' 'vodafone' 'post' 'alice' 'tim' 'uol' 'chello'
'tele2' 'poste']
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25 26 27]
{'asian': 0, 'black': 1, 'indian': 2, 'unspecified': 3, 'white': 4, nan: 5}
Zavedieme si ešte df4,df5 a df6, ktorým vyhodíme stĺpce "browser_name", "sex", "birthdate", "mail", "race" a "is_weekend", aby sme získali väčšiu množinu vzoriek na porovnávanie jednotlivých stratégii aj čo sa týka výberu atribútov:
df4 = df1.copy().drop(["browser_name", "sex", "birthdate", "mail", "race", "is_weekend"], axis=1)
df5 = df2.copy().drop(["browser_name", "sex", "birthdate", "mail", "race", "is_weekend"], axis=1)
df6 = df3.copy().drop(["browser_name", "sex", "birthdate", "mail", "race", "is_weekend"], axis=1)
Splitneme si teraz všetky vzorky na trénovanie (rozdelenie atribútom train_size nechám pre df1 rovnaký, ako sme používali my a ostatným pridelým iba random 80%, keďže my sme sa nad train_size-om hlbšie zamýšľali a ostatné slúžia iba na porovnanie nášho vybraného riešenia):
x1_train, x1_test, y1_train, y1_test = train_test_split(df1.drop("ack", axis=1), df1.ack, train_size=0.8114)
x2_train, x2_test, y2_train, y2_test = train_test_split(df2.drop("ack", axis=1), df2.ack, train_size=0.8114)
x3_train, x3_test, y3_train, y3_test = train_test_split(df3.drop("ack", axis=1), df3.ack, train_size=0.8)
x4_train, x4_test, y4_train, y4_test = train_test_split(df4.drop("ack", axis=1), df4.ack, train_size=0.8)
x5_train, x5_test, y5_train, y5_test = train_test_split(df5.drop("ack", axis=1), df5.ack, train_size=0.8114)
x6_train, x6_test, y6_train, y6_test = train_test_split(df6.drop("ack", axis=1), df6.ack, train_size=0.8)
print("df1:", x1_train.shape, x1_test.shape, "\ndf2:", x2_train.shape, x2_test.shape, "\ndf3", x3_train.shape, x3_test.shape)
print("df4", x4_train.shape, x4_test.shape, "\ndf5", x5_train.shape, x5_test.shape, "\ndf6", x6_train.shape, x6_test.shape)
df1: (8750, 13) (2035, 13) df2: (8750, 13) (2035, 13) df3 (8838, 13) (2210, 13) df4 (8628, 7) (2157, 7) df5 (8750, 7) (2035, 7) df6 (8838, 7) (2210, 7)
Teraz si vzorky otestujeme na Decision Tree Classifier a necháme vypísať metriky pre vyvodenie výsledkov:
dtc = DecisionTreeClassifier()
predicted_t = dtc.fit(x1_train, y1_train).predict(x1_test)
print("DF1:\n", classification_report(y1_test, predicted_t, target_names=["0", "1"]), "Accuracy:", accuracy_score(y1_test, predicted_t))
dtc = DecisionTreeClassifier()
predicted_t = dtc.fit(x2_train, y2_train).predict(x2_test)
print("\nDF2:\n", classification_report(y2_test, predicted_t, target_names=["0", "1"]), "Accuracy:", accuracy_score(y2_test, predicted_t))
dtc = DecisionTreeClassifier()
predicted_t = dtc.fit(x3_train, y3_train).predict(x3_test)
print("\nDF3:\n", classification_report(y3_test, predicted_t, target_names=["0", "1"]), "Accuracy:", accuracy_score(y3_test, predicted_t))
dtc = DecisionTreeClassifier()
predicted_t = dtc.fit(x4_train, y4_train).predict(x4_test)
print("\nDF4:\n", classification_report(y4_test, predicted_t, target_names=["0", "1"]), "Accuracy:", accuracy_score(y4_test, predicted_t))
dtc = DecisionTreeClassifier()
predicted_t = dtc.fit(x5_train, y5_train).predict(x5_test)
print("\nDF5:\n", classification_report(y5_test, predicted_t, target_names=["0", "1"]), "Accuracy:", accuracy_score(y5_test, predicted_t))
dtc = DecisionTreeClassifier()
predicted_t = dtc.fit(x6_train, y6_train).predict(x6_test)
print("\nDF6:\n", classification_report(y6_test, predicted_t, target_names=["0", "1"]), "Accuracy:", accuracy_score(y6_test, predicted_t))
DF1:
precision recall f1-score support
0 0.94 0.93 0.94 923
1 0.95 0.95 0.95 1112
accuracy 0.94 2035
macro avg 0.94 0.94 0.94 2035
weighted avg 0.94 0.94 0.94 2035
Accuracy: 0.942997542997543
DF2:
precision recall f1-score support
0 0.94 0.92 0.93 957
1 0.93 0.95 0.94 1078
accuracy 0.94 2035
macro avg 0.94 0.94 0.94 2035
weighted avg 0.94 0.94 0.94 2035
Accuracy: 0.9380835380835381
DF3:
precision recall f1-score support
0 0.94 0.95 0.94 1038
1 0.95 0.94 0.95 1172
accuracy 0.94 2210
macro avg 0.94 0.94 0.94 2210
weighted avg 0.94 0.94 0.94 2210
Accuracy: 0.9434389140271493
DF4:
precision recall f1-score support
0 0.96 0.95 0.95 1010
1 0.95 0.97 0.96 1147
accuracy 0.96 2157
macro avg 0.96 0.96 0.96 2157
weighted avg 0.96 0.96 0.96 2157
Accuracy: 0.9564209550301345
DF5:
precision recall f1-score support
0 0.95 0.95 0.95 890
1 0.96 0.96 0.96 1145
accuracy 0.95 2035
macro avg 0.95 0.95 0.95 2035
weighted avg 0.95 0.95 0.95 2035
Accuracy: 0.9538083538083538
DF6:
precision recall f1-score support
0 0.95 0.94 0.95 1024
1 0.95 0.96 0.96 1186
accuracy 0.95 2210
macro avg 0.95 0.95 0.95 2210
weighted avg 0.95 0.95 0.95 2210
Accuracy: 0.9524886877828054
Sumarizácia nad Decision Tree Classifier vyzerá tak, že najlepšie performoval df4, ktorý mal pomocou mean vyplnené prázdne hodnoty, mal nahredné outliers a bol nad ním spravený výber atribútov. Najhoršie dopadol df2, nad ktorým boli pomocou KNN vyplnené prázdne hodnoty, ale nemal spravený výber atribútov.
Teraz to vyskúšame nad Random Forest Classifier a vyvodíme výsledky z metrík:
forest = RandomForestClassifier()
predicted_f = forest.fit(x1_train, y1_train).predict(x1_test)
print("DF1:\n", classification_report(y1_test, predicted_f, target_names=["0", "1"]), "Accuracy:", accuracy_score(y1_test, predicted_f))
forest = RandomForestClassifier()
predicted_f = forest.fit(x2_train, y2_train).predict(x2_test)
print("DF2:\n", classification_report(y2_test, predicted_f, target_names=["0", "1"]), "Accuracy:", accuracy_score(y2_test, predicted_f))
forest = RandomForestClassifier()
predicted_f = forest.fit(x3_train, y3_train).predict(x3_test)
print("DF3:\n", classification_report(y3_test, predicted_f, target_names=["0", "1"]), "Accuracy:", accuracy_score(y3_test, predicted_f))
forest = RandomForestClassifier()
predicted_f = forest.fit(x4_train, y4_train).predict(x4_test)
print("DF4:\n", classification_report(y4_test, predicted_f, target_names=["0", "1"]), "Accuracy:", accuracy_score(y4_test, predicted_f))
forest = RandomForestClassifier()
predicted_f = forest.fit(x5_train, y5_train).predict(x5_test)
print("DF5:\n", classification_report(y5_test, predicted_f, target_names=["0", "1"]), "Accuracy:", accuracy_score(y5_test, predicted_f))
forest = RandomForestClassifier()
predicted_f = forest.fit(x6_train, y6_train).predict(x6_test)
print("DF6:\n", classification_report(y6_test, predicted_f, target_names=["0", "1"]), "Accuracy:", accuracy_score(y6_test, predicted_f))
DF1:
precision recall f1-score support
0 0.97 0.96 0.97 923
1 0.97 0.98 0.97 1112
accuracy 0.97 2035
macro avg 0.97 0.97 0.97 2035
weighted avg 0.97 0.97 0.97 2035
Accuracy: 0.9690417690417691
DF2:
precision recall f1-score support
0 0.96 0.95 0.96 957
1 0.96 0.97 0.96 1078
accuracy 0.96 2035
macro avg 0.96 0.96 0.96 2035
weighted avg 0.96 0.96 0.96 2035
Accuracy: 0.9606879606879607
DF3:
precision recall f1-score support
0 0.96 0.96 0.96 1038
1 0.97 0.97 0.97 1172
accuracy 0.96 2210
macro avg 0.96 0.96 0.96 2210
weighted avg 0.96 0.96 0.96 2210
Accuracy: 0.9642533936651584
DF4:
precision recall f1-score support
0 0.99 0.97 0.98 1010
1 0.98 0.99 0.98 1147
accuracy 0.98 2157
macro avg 0.98 0.98 0.98 2157
weighted avg 0.98 0.98 0.98 2157
Accuracy: 0.9809921186833566
DF5:
precision recall f1-score support
0 0.98 0.98 0.98 890
1 0.98 0.99 0.99 1145
accuracy 0.98 2035
macro avg 0.98 0.98 0.98 2035
weighted avg 0.98 0.98 0.98 2035
Accuracy: 0.9832923832923833
DF6:
precision recall f1-score support
0 0.98 0.98 0.98 1024
1 0.98 0.98 0.98 1186
accuracy 0.98 2210
macro avg 0.98 0.98 0.98 2210
weighted avg 0.98 0.98 0.98 2210
Accuracy: 0.9796380090497737
Sumarizácia nad Random Forest Classifier vyzerá tak, že najlepšie performoval df5, ktorý mal pomocou KNN vyplnené prázdne hodnoty, mal nahredené outliers a bol nad ním spravený výber atribútov. Najhoršie dopadol df2, nad ktorým boli pomocou KNN vyplnené prázdne hodnoty, ale nemal spravený výber atribútov. Random Forest Classifier je ale o dosť pomalší ako Decision Tree Classifier.
Teraz vyberieme classifier, ktorý nám hodil lepšie metriky a to je Random Forest Classifier a vyskúšame na ňom teraz náš param grid a tým zistíme, že aký to má celkový efekt:
our_best_param_grid = {
'n_estimators': 500,
'min_samples_split': 3,
'min_samples_leaf': 1,
'max_depth': None,
}
forest = RandomForestClassifier()
forest.set_params(**our_best_param_grid)
predicted_f = forest.fit(x1_train, y1_train).predict(x1_test)
print("\nDF1:\n", classification_report(y1_test, predicted_f, target_names=["0", "1"]), "Accuracy:", accuracy_score(y1_test, predicted_f))
predicted_f = forest.fit(x2_train, y2_train).predict(x2_test)
print("\nDF2:\n", classification_report(y2_test, predicted_f, target_names=["0", "1"]), "Accuracy:", accuracy_score(y2_test, predicted_f))
predicted_f = forest.fit(x3_train, y3_train).predict(x3_test)
print("\nDF3:\n", classification_report(y3_test, predicted_f, target_names=["0", "1"]), "Accuracy:", accuracy_score(y3_test, predicted_f))
predicted_f = forest.fit(x4_train, y4_train).predict(x4_test)
print("\nDF4:\n", classification_report(y4_test, predicted_f, target_names=["0", "1"]), "Accuracy:", accuracy_score(y4_test, predicted_f))
predicted_f = forest.fit(x5_train, y5_train).predict(x5_test)
print("\nDF5:\n", classification_report(y5_test, predicted_f, target_names=["0", "1"]), "Accuracy:", accuracy_score(y5_test, predicted_f))
predicted_f = forest.fit(x6_train, y6_train).predict(x6_test)
print("\nDF6:\n", classification_report(y6_test, predicted_f, target_names=["0", "1"]), "Accuracy:", accuracy_score(y6_test, predicted_f))
DF1:
precision recall f1-score support
0 0.97 0.96 0.97 923
1 0.97 0.98 0.97 1112
accuracy 0.97 2035
macro avg 0.97 0.97 0.97 2035
weighted avg 0.97 0.97 0.97 2035
Accuracy: 0.9685503685503686
DF2:
precision recall f1-score support
0 0.96 0.95 0.96 957
1 0.96 0.97 0.96 1078
accuracy 0.96 2035
macro avg 0.96 0.96 0.96 2035
weighted avg 0.96 0.96 0.96 2035
Accuracy: 0.9597051597051597
DF3:
precision recall f1-score support
0 0.97 0.96 0.96 1038
1 0.96 0.97 0.97 1172
accuracy 0.97 2210
macro avg 0.97 0.96 0.97 2210
weighted avg 0.97 0.97 0.97 2210
Accuracy: 0.965158371040724
DF4:
precision recall f1-score support
0 0.99 0.97 0.98 1010
1 0.98 0.99 0.98 1147
accuracy 0.98 2157
macro avg 0.98 0.98 0.98 2157
weighted avg 0.98 0.98 0.98 2157
Accuracy: 0.980528511821975
DF5:
precision recall f1-score support
0 0.98 0.98 0.98 890
1 0.98 0.99 0.99 1145
accuracy 0.98 2035
macro avg 0.98 0.98 0.98 2035
weighted avg 0.98 0.98 0.98 2035
Accuracy: 0.9837837837837838
DF6:
precision recall f1-score support
0 0.98 0.98 0.98 1024
1 0.98 0.98 0.98 1186
accuracy 0.98 2210
macro avg 0.98 0.98 0.98 2210
weighted avg 0.98 0.98 0.98 2210
Accuracy: 0.9791855203619909
Výsledky z tohto merania sú také, že bez udelených parametrov to performuje trochu horšie ako s udelenými parametrami. Pri najlepších výsledok to bez udelených parametrov má accuracy 0.9832923832923833 ~ 0.9833 a s udelenými parametrami to má accuracy 0.9837837837837838 ~ 0.9838, čo je o trošičku lepšie.
Vyhodnotenie vplyvu vyskúšaných stratégií riešenia na klasifikáciu¶
Zistili sme, že najlepšie výsledky máme, keď prázdne hodnoty v dátach vyplníme pomocou KNN, vyhodíme z dát duplikáty a nahradíme outliers. Následne spravíme výber atribútov a spravíme transformáciu dát, kde nahradíme dáta reprezentované v stringoch číslami. Následne sme vyskúšali takto predspracované dáta nad klasifikátormi Decision Tree Classifier a Random Forest Classifier. Random Forest Classifier mal lepšie metriky, ale bol o niečo pomalší. Vybrali sme týmto pádom Random Forest Classifier a performoval lepšie s naším param gridom ako bez neho. Tieto výsledky definovali náš najlepší model a data pipeline.